1 /******************************************************************************
2 * Copyright (c) Intel Corporation - All rights reserved. *
3 * This file is part of the LIBXSMM library. *
4 * *
5 * For information on the license, see the LICENSE file. *
6 * Further information: https://github.com/hfp/libxsmm/ *
7 * SPDX-License-Identifier: BSD-3-Clause *
8 ******************************************************************************/
9 /* Evangelos Georganas (Intel Corp.)
10 ******************************************************************************/
11 #include <libxsmm.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <stdio.h>
15 #include <math.h>
16
17
18 LIBXSMM_INLINE
sfill_matrix(float * matrix,unsigned int ld,unsigned int m,unsigned int n)19 void sfill_matrix ( float *matrix, unsigned int ld, unsigned int m, unsigned int n )
20 {
21 unsigned int i, j;
22 double dtmp;
23
24 if ( ld < m )
25 {
26 fprintf(stderr,"Error is sfill_matrix: ld=%u m=%u mismatched!\n",ld,m);
27 exit(EXIT_FAILURE);
28 }
29 for ( j = 1; j <= n; j++ )
30 {
31 /* Fill through the leading dimension */
32 for ( i = 1; i <= ld; i++ )
33 {
34 dtmp = 1.0 - 2.0*libxsmm_rng_f64();
35 matrix [ (j-1)*ld + (i-1) ] = (float) dtmp;
36 }
37 }
38 }
39
main(int argc,char * argv[])40 int main(int argc, char* argv[])
41 {
42 unsigned int m = 64, n = 64, perform_scale = 1, perform_shift = 1, perform_bias = 1, scale_rows = 1, vectors_size, i, j, k, iters = 10000;
43 libxsmm_blasint ld_in = 64, ld_out = 64;
44 float *sinp, *sout, *scale_vals, *shift_vals, *bias_vals, *ref_out;
45 libxsmm_meltw_scal_flags jit_flags = 0;
46 libxsmm_meltwfunction_scale kernel;
47 libxsmm_meltw_scale_param params;
48 libxsmm_matdiff_info norms_out;
49 unsigned long long l_start, l_end;
50 double l_total = 0.0, l_total2 = 0.0;
51
52 libxsmm_init();
53
54 libxsmm_matdiff_clear(&norms_out);
55
56 if ( argc > 1 ) m = atoi(argv[1]);
57 if ( argc > 2 ) n = atoi(argv[2]);
58 if ( argc > 3 ) ld_in = atoi(argv[3]);
59 if ( argc > 4 ) ld_out = atoi(argv[4]);
60 if ( argc > 5 ) perform_shift = atoi(argv[5]);
61 if ( argc > 6 ) perform_scale = atoi(argv[6]);
62 if ( argc > 7 ) perform_bias = atoi(argv[7]);
63 if ( argc > 8 ) scale_rows = atoi(argv[8]);
64 if ( argc > 9 ) iters = atoi(argv[9]);
65
66 m = LIBXSMM_MAX(m,1);
67 n = LIBXSMM_MAX(n,1);
68 ld_in = LIBXSMM_MAX(ld_in,(libxsmm_blasint)m);
69 ld_out = LIBXSMM_MAX(ld_out,(libxsmm_blasint)m);
70
71 vectors_size = (scale_rows == 1) ? n : m;
72
73 /* Allocate arrays */
74 sinp = (float*) malloc( ld_in*n*sizeof(float) );
75 sout = (float*) malloc( ld_out*n*sizeof(float) );
76 ref_out = (float*) malloc( ld_out*n*sizeof(float) );
77
78 scale_vals = (float*) malloc(vectors_size*sizeof(float) );
79 shift_vals = (float*) malloc(vectors_size*sizeof(float) );
80 bias_vals = (float*) malloc(vectors_size*sizeof(float) );
81
82 /* Fill matrices with random data */
83 sfill_matrix ( sinp, ld_in, m, n );
84 sfill_matrix ( scale_vals, vectors_size, vectors_size, 1 );
85 sfill_matrix ( shift_vals, vectors_size, vectors_size, 1 );
86 sfill_matrix ( bias_vals, vectors_size, vectors_size, 1 );
87
88 /* Calculate reference results... */
89 if (scale_rows == 1) {
90 for (j = 0; j < n; j++) {
91 float scale = scale_vals[j];
92 float shift = shift_vals[j];
93 float bias = bias_vals[j];
94 for (i = 0; i < m; i++) {
95 float out;
96 out = sinp[j*ld_in + i];
97 if (perform_shift) out += shift;
98 if (perform_scale) out *= scale;
99 if (perform_bias) out += bias;
100 ref_out[j*ld_out + i] = out;
101 }
102 }
103 } else {
104 /* In this case we reduce columns */
105 for (i = 0; i < m; i++) {
106 float scale = scale_vals[i];
107 float shift = shift_vals[i];
108 float bias = bias_vals[i];
109 for (j = 0; j < n; j++) {
110 float out;
111 out = sinp[j*ld_in + i];
112 if (perform_shift) out += shift;
113 if (perform_scale) out *= scale;
114 if (perform_bias) out += bias;
115 ref_out[j*ld_out + i] = out;
116 }
117 }
118 }
119
120 /* Generate JITED kernel */
121 if (scale_rows == 1) {
122 jit_flags = LIBXSMM_MELTW_FLAG_SCALE_ROWS;
123 } else {
124 jit_flags = LIBXSMM_MELTW_FLAG_SCALE_COLS;
125 }
126 if (perform_scale == 1) {
127 jit_flags |= LIBXSMM_MELTW_FLAG_SCALE_MULT;
128 }
129 if (perform_shift == 1) {
130 jit_flags |= LIBXSMM_MELTW_FLAG_SCALE_SHIFT;
131 }
132 if (perform_bias == 1) {
133 jit_flags |= LIBXSMM_MELTW_FLAG_SCALE_ADD_BIAS;
134 }
135
136 printf("JITing scale kernel... \n");
137 kernel = libxsmm_dispatch_meltw_scale(m, n, &ld_in, &ld_out, LIBXSMM_DATATYPE_F32, LIBXSMM_DATATYPE_F32, jit_flags);
138
139 /* Call JITed kernel and compare result */
140 printf("Calling JITed reduce kernel... \n");
141 params.in_ptr = sinp;
142 params.out_ptr = sout;
143 params.shift_vals_ptr = shift_vals;
144 params.scale_vals_ptr = scale_vals;
145 params.bias_vals_ptr = bias_vals;
146 kernel( ¶ms );
147
148 /* compare */
149 printf("##########################################\n");
150 printf("# Correctness - Eltwise scale out #\n");
151 printf("##########################################\n");
152 libxsmm_matdiff(&norms_out, LIBXSMM_DATATYPE_F32, n * ld_out, 1, ref_out, sout, 0, 0);
153 printf("L1 reference : %.25g\n", norms_out.l1_ref);
154 printf("L1 test : %.25g\n", norms_out.l1_tst);
155 printf("L2 abs.error : %.24f\n", norms_out.l2_abs);
156 printf("L2 rel.error : %.24f\n", norms_out.l2_rel);
157 printf("Linf abs.error: %.24f\n", norms_out.linf_abs);
158 printf("Linf rel.error: %.24f\n", norms_out.linf_rel);
159 printf("Check-norm : %.24f\n\n", norms_out.normf_rel);
160
161 l_start = libxsmm_timer_tick();
162 /* Calculate reference results... */
163 for (k = 0; k < iters; k++) {
164 /* Calculate reference results... */
165 if (scale_rows == 1) {
166 for (j = 0; j < n; j++) {
167 float scale = scale_vals[j];
168 float shift = shift_vals[j];
169 float bias = bias_vals[j];
170 for (i = 0; i < m; i++) {
171 float out;
172 out = sinp[j*ld_in + i];
173 if (perform_shift) out += shift;
174 if (perform_scale) out *= scale;
175 if (perform_bias) out += bias;
176 ref_out[j*ld_out + i] = out;
177 }
178 }
179 } else {
180 /* In this case we reduce columns */
181 for (i = 0; i < m; i++) {
182 float scale = scale_vals[i];
183 float shift = shift_vals[i];
184 float bias = bias_vals[i];
185 for (j = 0; j < n; j++) {
186 float out;
187 out = sinp[j*ld_in + i];
188 if (perform_shift) out += shift;
189 if (perform_scale) out *= scale;
190 if (perform_bias) out += bias;
191 ref_out[j*ld_out + i] = out;
192 }
193 }
194 }
195 }
196 l_end = libxsmm_timer_tick();
197 l_total = libxsmm_timer_duration(l_start, l_end);
198 printf("Reference time = %.5g\n", ((double)(l_total)));
199
200 l_start = libxsmm_timer_tick();
201 for (k = 0; k < iters; k++) {
202 kernel( ¶ms );
203 }
204 l_end = libxsmm_timer_tick();
205 l_total2 = libxsmm_timer_duration(l_start, l_end);
206 printf("Optimized time = %.5g\n", ((double)(l_total2)));
207 printf("Speedup is = %.5g\n", ((double)(l_total/l_total2)));
208
209
210
211 free(sinp);
212 free(sout);
213 free(ref_out);
214 free(scale_vals);
215 free(bias_vals);
216 free(shift_vals);
217
218 return EXIT_SUCCESS;
219 }
220
221