1 /* -*- mode: c++ -*- */
2 /* ------------------------------ */
3 /*  driver for lmder example.     */
4 /* ------------------------------ */
5 
6 #include <stdio.h>
7 #include <math.h>
8 #include <string.h>
9 #include <cminpack.h>
10 
11 #include <lmder.cu>
12 #include <covar1.cu>
13 #define real __cminpack_real__
14 
// Wrap a CUDA runtime call: on any error, print the failing file/line and
// the runtime's error string to stderr, then terminate the process.
#define cutilSafeCall(err)           __cudaSafeCall      (err, __FILE__, __LINE__)

// Helper behind cutilSafeCall; not meant to be called directly.
//   err  - status returned by a CUDA runtime API call
//   file - source file of the call site (from __FILE__)
//   line - source line of the call site (from __LINE__)
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
    if( cudaSuccess != err) {
        fprintf(stderr, "cudaSafeCall() Runtime API error in file <%s>, line %i : %s.\n",
                file, line, cudaGetErrorString( err) );
        exit(-1);  // abort immediately: later calls would fail on the sticky error anyway
    }
}
24 
25 //--------------------------------------------------------------------------
26 //--------------------------------------------------------------------------
27 const unsigned int NUM_OBSERVATIONS = 15; // m
28 const unsigned int NUM_PARAMS = 3; // 3 = n
29 
30 //--------------------------------------------------------------------------
31 //--------------------------------------------------------------------------
32 //
33 //  fixed arrangement of threads to be run
34 //
35 const unsigned int NUM_THREADS = 2048;
36 const unsigned int NUM_THREADS_PER_BLOCK = 128;
37 const unsigned int NUM_BLOCKS = NUM_THREADS / NUM_THREADS_PER_BLOCK;
38 
39 //--------------------------------------------------------------------------
40 //
41 // the struct for returning results from the GPU
42 //
43 
// Per-thread result record copied back from the GPU. The host compares
// records with memcmp, so the layout (and any padding) must be identical
// for all threads — do not reorder or resize these fields casually.
typedef struct
{
    real fnorm;                          // final L2 norm of the residuals (enorm of fvec)
    int nfev;                            // number of function evaluations reported by lmder
    int njev;                            // number of Jacobian evaluations reported by lmder
    int info;                            // lmder termination/exit code
    int rankJ;                           // rank of the Jacobian as derived from covar1's return
    real solution[NUM_PARAMS];           // fitted parameter vector x
    real covar[NUM_PARAMS][NUM_PARAMS];  // covariance estimate copied out of fjac after covar1
} ResultType;
54 
55 //--------------------------------------------------------------------------
56 // the cost function
57 //--------------------------------------------------------------------------
__cminpack_attr__ /* __device__ */
int fcnder_mn(
    void *p, int m, int n, const real *x,
    real *fvec, real *fjac,
    int ldfjac, int iflag)
{
    /* Residual/Jacobian callback (fcn) for the lmder example.
     *
     * Model: y[i] ~ x[0] + t1 / (x[1]*t2 + x[2]*t3), fitted against the 15
     * fixed observations in y[] below.
     *
     *   p      - user data pointer (unused in this example)
     *   m, n   - problem sizes (this example uses the file-level constants
     *            NUM_OBSERVATIONS / NUM_PARAMS directly, so m and n are
     *            expected to match them)
     *   x      - current parameter vector, n entries
     *   fvec   - out: residuals, m entries (filled when iflag != 2)
     *   fjac   - out: Jacobian, column-major with leading dimension ldfjac
     *            (filled when iflag == 2)
     *   iflag  - 0: print hook (nprint > 0); 1: compute fvec; 2: compute fjac
     *
     * Returns 0 (a negative value would tell lmder to terminate).
     */

    int i;
    real tmp1, tmp2, tmp3, tmp4;
    /* the fixed observations of the classic MINPACK lmder test problem */
    real y[NUM_OBSERVATIONS]={1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1,
                              2.9e-1, 3.2e-1, 3.5e-1, 3.9e-1, 3.7e-1,
                              5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};

    (void)p; (void)m; (void)n;  /* sizes are fixed by the constants above */

    if (iflag == 0) {
        /* insert print statements here when nprint is positive. */
        return 0;
    }

    if (iflag != 2) {

        /* residuals: fvec[i-1] = y[i-1] - model(x, i) */
        for (i = 1; i <= (int)NUM_OBSERVATIONS; i++) {
            tmp1 = i;
            tmp2 = (NUM_OBSERVATIONS+1) - i;
            tmp3 = (i > 8) ? tmp2 : tmp1;   /* mirror t3 for the upper half */
            fvec[i-1] = y[i-1] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
        } // for

    } else {

        /* analytic Jacobian d(fvec)/d(x), column-major in fjac */
        for (i = 1; i <= (int)NUM_OBSERVATIONS; i++) {
            tmp1 = i;
            tmp2 = (NUM_OBSERVATIONS+1) - i;
            tmp3 = (i > 8) ? tmp2 : tmp1;
            tmp4 = x[1]*tmp2 + x[2]*tmp3;
            tmp4 = tmp4*tmp4;               /* denominator squared */
            fjac[i-1 + ldfjac*0] = -1.;
            fjac[i-1 + ldfjac*1] = tmp1*tmp2/tmp4;
            fjac[i-1 + ldfjac*2] = tmp1*tmp3/tmp4;
        } // for
    } // if

    return 0;
}
104 
105 //--------------------------------------------------------------------------
106 // the kernel in the GPU
107 //--------------------------------------------------------------------------
// Kernel: every thread independently solves the same 15x3 nonlinear
// least-squares test problem with lmder, estimates the covariance with
// covar1, and stores its results in pResults[globalThreadId].
//
// Launch contract: a 1D grid of 1D blocks; pResults must hold at least
// gridDim.x * blockDim.x entries (no bounds check is performed here).
__global__ void mainKernel(ResultType  pResults[])
{
    int ldfjac, maxfev, mode, nprint, info, nfev, njev;
    int ipvt[NUM_PARAMS];
    real ftol, xtol, gtol, factor, fnorm;
    // all work arrays live in thread-local storage, sized by the constants
    real x[NUM_PARAMS], fvec[NUM_OBSERVATIONS],
            diag[NUM_PARAMS], fjac[NUM_OBSERVATIONS*NUM_PARAMS], qtf[NUM_PARAMS],
            wa1[NUM_PARAMS], wa2[NUM_PARAMS], wa3[NUM_PARAMS], wa4[NUM_OBSERVATIONS];
    int k;

    // m = NUM_OBSERVATIONS;
    // n = NUM_PARAMS;

    /*      the following starting values provide a rough fit. */

    x[1-1] = 1.;
    x[2-1] = 1.;
    x[3-1] = 1.;

    ldfjac = NUM_OBSERVATIONS;

    /*      set ftol and xtol to the square root of the machine precision */
    /*      and gtol to zero. unless high precision solutions are */
    /*      required, these are the recommended settings. */

    ftol = sqrt(__cminpack_func__(dpmpar)(1));
    xtol = sqrt(__cminpack_func__(dpmpar)(1));
    gtol = 0.;

    maxfev = 400;   // cap on function evaluations
    mode = 1;       // let lmder scale the variables internally
    factor = 1.e2;  // initial step bound factor
    nprint = 0;     // no iteration printing

    // -------------------------------
    // call lmder, enorm, and covar1
    // -------------------------------
    info = __cminpack_func__(lmder)(__cminpack_param_fcnder_mn__ 0, NUM_OBSERVATIONS, NUM_PARAMS,
                 x, fvec, fjac, ldfjac, ftol, xtol, gtol,
                 maxfev, diag, mode, factor, nprint, &nfev, &njev,
                 ipvt, qtf, wa1, wa2, wa3, wa4);

    fnorm = __cminpack_func__(enorm)(NUM_OBSERVATIONS, fvec);

    // NOTE: REMOVED THE TEST OF ORIGINAL MINPACK covar routine

    /* test covar1, which also estimates the rank of the Jacobian */
    ftol = __cminpack_func__(dpmpar)(1);  // reuse ftol as covar1's tolerance
    k = __cminpack_func__(covar1)(NUM_OBSERVATIONS, NUM_PARAMS,
               fnorm*fnorm, fjac, ldfjac, ipvt, ftol, wa1);

    // ----------------------------------
    // save the results in global memory
    // ----------------------------------
    int threadId = (blockIdx.x * blockDim.x) + threadIdx.x;

    pResults[threadId].fnorm = fnorm;
    pResults[threadId].nfev = nfev;
    pResults[threadId].njev = njev;
    pResults[threadId].info = info;

    for (int j=1; j<=NUM_PARAMS; j++) {
        pResults[threadId].solution[j-1] = x[j-1];
    }

    // copy the NUM_PARAMS x NUM_PARAMS covariance block out of fjac.
    // NOTE(review): this indexes fjac as (i-1)*ldfjac + j-1, i.e. transposed
    // relative to fjac's column-major layout; presumably covar1 leaves a
    // symmetric matrix so the transpose is harmless — confirm against covar1.
    for (int i=1; i<=NUM_PARAMS; i++) {
        for (int j=1; j<=NUM_PARAMS; j++) {
            pResults[threadId].covar[i-1][j-1] = fjac[(i-1)*ldfjac+j-1];
	} // for
    } // for

    // covar1's return of 0 is mapped to full rank (NUM_PARAMS) here;
    // any nonzero return is stored as the rank estimate itself.
    pResults[threadId].rankJ =  (k != 0 ? k : NUM_PARAMS);

} // ()
182 
183 //--------------------------------------------------------------------------
184 //--------------------------------------------------------------------------
// Host driver: allocate result buffers, launch the kernel, verify that all
// threads produced identical results, and print the first thread's solution.
int main (int argc, char** argv)
{

    fprintf (stderr, "\ntlmder starts ! \n");
    //  ...............................................................
    // choose the GPU device
    // NOTE(review): device index 1 is hard-coded, which assumes at least
    // two CUDA devices are present; cudaSetDevice is now checked so a
    // missing device fails loudly instead of silently running on device 0.
    //  ...............................................................
    unsigned int GPU_ID = 1;
    // unsigned int GPU_ID =  cutGetMaxGflopsDeviceId() ;
    cutilSafeCall(cudaSetDevice(GPU_ID));
    fprintf (stderr, " CUDA device chosen = %u \n", GPU_ID);

    // .......................................................
    //  get memory in the GPU to store the results
    // .......................................................
    ResultType * results_GPU = 0;
    cutilSafeCall(cudaMalloc( &results_GPU,  NUM_THREADS * sizeof(ResultType) ));

    // .......................................................
    //  get memory in the CPU to store the results
    //  (pinned host memory, required for best copy performance)
    // .......................................................
    ResultType * results_CPU = 0;
    cutilSafeCall(cudaMallocHost( &results_CPU, NUM_THREADS * sizeof(ResultType) ));

    // .......................................................
    //  launch the kernel
    // .......................................................
    fprintf (stderr, " \nlaunching the kernel num. blocks = %u, threads per block = %u\n total threads = %u\n\n",
             NUM_BLOCKS, NUM_THREADS_PER_BLOCK, NUM_THREADS);

    mainKernel<<<NUM_BLOCKS,NUM_THREADS_PER_BLOCK>>> ( results_GPU );

    // kernel launches do not return a status: check for launch-configuration
    // errors explicitly, then wait and let execution errors surface too
    cutilSafeCall(cudaGetLastError());

    // .......................................................
    // wait for termination
    // .......................................................
    cutilSafeCall(cudaThreadSynchronize());
    fprintf (stderr, " GPU processing done \n\n");

    // .......................................................
    // copy back to CPU the results
    // .......................................................
    cutilSafeCall(cudaMemcpy( results_CPU, results_GPU,
                              NUM_THREADS * sizeof(ResultType),
                              cudaMemcpyDeviceToHost
                              ));

    // .......................................................
    // check all the threads computed the same results
    // .......................................................
    bool ok = true;
    for (unsigned int i = 1; i<NUM_THREADS; i++) {
	if ( memcmp (&results_CPU[0], &results_CPU[i], sizeof(ResultType)) != 0) {
            // warning: may the padding bytes be different ?
            ok = false;
	}
    } // for

    if (ok) {
	fprintf (stderr, " !!! all threads computed the same results !!! \n\n");
    } else {
	fprintf (stderr, "ERROR in results of threads \n");
    }

    // .......................................................
    // show the results (of thread 0, all threads being equal) !
    // .......................................................

    printf("      final l2 norm of the residuals%15.7g\n\n", results_CPU[0].fnorm);
    printf("      number of function evaluations%10i\n\n", results_CPU[0].nfev);
    printf("      number of Jacobian evaluations%10i\n\n", results_CPU[0].njev);
    printf("      exit parameter                %10i\n\n", results_CPU[0].info);
    printf("      final approximate solution\n");

    for (unsigned int j=0; j<NUM_PARAMS; j++)  {
        printf("%15.7g", results_CPU[0].solution[j]);
    }
    printf("\n");

    printf("      covariance\n");

    for (unsigned int i=0; i<NUM_PARAMS; i++) {
        for (unsigned int j=0; j<NUM_PARAMS; j++) {
            printf("%15.7g", results_CPU[0].covar[i][j]);
	} // for
	printf ("\n");
    } // for

    printf("\n");
    printf(" rank(J) = %d\n", results_CPU[0].rankJ );

    // release device and pinned host memory before tearing down the context
    cutilSafeCall(cudaFree(results_GPU));
    cutilSafeCall(cudaFreeHost(results_CPU));
    cudaThreadExit();
    //cutilExit(argc, argv);
    return 0;
} // ()
280