1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // author:BUG1989 (https://github.com/BUG1989/) Long-term support.
4 // author:JansonZhu (https://github.com/JansonZhu) Implemented the function of entropy calibration.
5 //
6 // Copyright (C) 2019 BUG1989. All rights reserved.
7 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
8 //
9 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
10 // in compliance with the License. You may obtain a copy of the License at
11 //
12 // https://opensource.org/licenses/BSD-3-Clause
13 //
14 // Unless required by applicable law or agreed to in writing, software distributed
15 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
16 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
17 // specific language governing permissions and limitations under the License.
18 
19 #ifdef _MSC_VER
20 #define _CRT_SECURE_NO_DEPRECATE
21 #endif
22 
23 #include <float.h>
24 #include <limits.h>
25 #include <math.h>
26 #include <stdio.h>
27 #include <stdint.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #if defined(USE_NCNN_SIMPLEOCV)
32 #include "simpleocv.h"
33 #elif defined(USE_LOCAL_IMREADWRITE)
34 #include "imreadwrite.h"
35 #else
36 #include <opencv2/core/core.hpp>
37 #include <opencv2/highgui/highgui.hpp>
38 #endif
39 #include <string>
40 #include <vector>
41 
42 // ncnn public header
43 #include "benchmark.h"
44 #include "cpu.h"
45 #include "net.h"
46 
47 // ncnn private header
48 #include "layer/convolution.h"
49 #include "layer/convolutiondepthwise.h"
50 #include "layer/innerproduct.h"
51 
// Calibration statistics collected for one quantized bottom blob.
class QuantBlobStat
{
public:
    // All scalar statistics start at zero; both histograms start empty.
    QuantBlobStat()
        : threshold(0.f), absmax(0.f), total(0)
    {
    }

public:
    // clipping threshold chosen by calibration (the int8 scale is 127 / threshold)
    float threshold;
    // largest absolute activation value observed for this blob
    float absmax;

    // ACIQ
    // element count of the blob, used by the ACIQ method
    int total;

    // KL
    // raw per-bin counts of absolute activation values
    std::vector<uint64_t> histogram;
    // histogram divided by its total count
    std::vector<float> histogram_normed;
};
73 
74 class QuantNet : public ncnn::Net
75 {
76 public:
77     QuantNet();
78 
79     std::vector<ncnn::Blob>& blobs;
80     std::vector<ncnn::Layer*>& layers;
81 
82 public:
83     std::vector<std::vector<std::string> > listspaths;
84     std::vector<std::vector<float> > means;
85     std::vector<std::vector<float> > norms;
86     std::vector<std::vector<int> > shapes;
87     std::vector<int> type_to_pixels;
88     int quantize_num_threads;
89 
90 public:
91     int init();
92     void print_quant_info() const;
93     int save_table(const char* tablepath);
94     int quantize_KL();
95     int quantize_ACIQ();
96     int quantize_EQ();
97 
98 public:
99     std::vector<int> input_blobs;
100     std::vector<int> conv_layers;
101     std::vector<int> conv_bottom_blobs;
102     std::vector<int> conv_top_blobs;
103 
104     // result
105     std::vector<QuantBlobStat> quant_blob_stats;
106     std::vector<ncnn::Mat> weight_scales;
107     std::vector<ncnn::Mat> bottom_blob_scales;
108 };
109 
QuantNet()110 QuantNet::QuantNet()
111     : blobs(mutable_blobs()), layers(mutable_layers())
112 {
113     quantize_num_threads = ncnn::get_cpu_count();
114 }
115 
init()116 int QuantNet::init()
117 {
118     // find all input layers
119     for (int i = 0; i < (int)layers.size(); i++)
120     {
121         const ncnn::Layer* layer = layers[i];
122         if (layer->type == "Input")
123         {
124             input_blobs.push_back(layer->tops[0]);
125         }
126     }
127 
128     // find all conv layers
129     for (int i = 0; i < (int)layers.size(); i++)
130     {
131         const ncnn::Layer* layer = layers[i];
132         if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise" || layer->type == "InnerProduct")
133         {
134             conv_layers.push_back(i);
135             conv_bottom_blobs.push_back(layer->bottoms[0]);
136             conv_top_blobs.push_back(layer->tops[0]);
137         }
138     }
139 
140     const int conv_layer_count = (int)conv_layers.size();
141     const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
142 
143     quant_blob_stats.resize(conv_bottom_blob_count);
144     weight_scales.resize(conv_layer_count);
145     bottom_blob_scales.resize(conv_bottom_blob_count);
146 
147     return 0;
148 }
149 
save_table(const char * tablepath)150 int QuantNet::save_table(const char* tablepath)
151 {
152     FILE* fp = fopen(tablepath, "wb");
153     if (!fp)
154     {
155         fprintf(stderr, "fopen %s failed\n", tablepath);
156         return -1;
157     }
158 
159     const int conv_layer_count = (int)conv_layers.size();
160     const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
161 
162     for (int i = 0; i < conv_layer_count; i++)
163     {
164         const ncnn::Mat& weight_scale = weight_scales[i];
165 
166         fprintf(fp, "%s_param_0 ", layers[conv_layers[i]]->name.c_str());
167         for (int j = 0; j < weight_scale.w; j++)
168         {
169             fprintf(fp, "%f ", weight_scale[j]);
170         }
171         fprintf(fp, "\n");
172     }
173 
174     for (int i = 0; i < conv_bottom_blob_count; i++)
175     {
176         const ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
177 
178         fprintf(fp, "%s ", layers[conv_layers[i]]->name.c_str());
179         for (int j = 0; j < bottom_blob_scale.w; j++)
180         {
181             fprintf(fp, "%f ", bottom_blob_scale[j]);
182         }
183         fprintf(fp, "\n");
184     }
185 
186     fclose(fp);
187 
188     fprintf(stderr, "ncnn int8 calibration table create success, best wish for your int8 inference has a low accuracy loss...\\(^0^)/...233...\n");
189 
190     return 0;
191 }
192 
print_quant_info() const193 void QuantNet::print_quant_info() const
194 {
195     for (int i = 0; i < (int)conv_bottom_blobs.size(); i++)
196     {
197         const QuantBlobStat& stat = quant_blob_stats[i];
198 
199         float scale = 127 / stat.threshold;
200 
201         fprintf(stderr, "%-40s : max = %-15f  threshold = %-15f  scale = %-15f\n", layers[conv_layers[i]]->name.c_str(), stat.absmax, stat.threshold, scale);
202     }
203 }
204 
205 /**
206  * Read and resize image
207  * shape is input as [w,h,...]
208  * if w and h both are given, image will be resized to exactly size.
209  * if w and h both are zero or negative, image will not be resized.
210  * if only h is zero or negative, image's width will scaled resize to w, keeping aspect ratio.
211  * if only w is zero or negative, image's height will scaled resize to h
212  * @return ncnn::Mat
213  */
214 
read_and_resize_image(const std::vector<int> & shape,const std::string & imagepath,int pixel_convert_type)215 inline ncnn::Mat read_and_resize_image(const std::vector<int>& shape, const std::string& imagepath, int pixel_convert_type)
216 {
217     int target_w = shape[0];
218     int target_h = shape[1];
219     cv::Mat bgr = cv::imread(imagepath, 1);
220     if (target_h <= 0 && target_w <= 0)
221     {
222         return ncnn::Mat::from_pixels(bgr.data, pixel_convert_type, bgr.cols, bgr.rows);
223     }
224     if (target_h <= 0 || target_w <= 0)
225     {
226         float scale = 1.0;
227         if (target_h <= 0)
228         {
229             scale = 1.0 * bgr.cols / target_w;
230             target_h = int(1.0 * bgr.rows / scale);
231         }
232         if (target_w <= 0)
233         {
234             scale = 1.0 * bgr.rows / target_h;
235             target_w = int(1.0 * bgr.cols / scale);
236         }
237     }
238     return ncnn::Mat::from_pixels_resize(bgr.data, pixel_convert_type, bgr.cols, bgr.rows, target_w, target_h);
239 }
240 
// Kullback-Leibler divergence sum(a[i] * log(a[i] / b[i])) of distribution a
// relative to distribution b.
//
// Entries with a[i] == 0 contribute nothing (the usual 0 * log(0) = 0
// convention) instead of turning the whole sum into NaN. Only the common
// prefix of both vectors is evaluated, so a shorter b is never read out of range.
static float compute_kl_divergence(const std::vector<float>& a, const std::vector<float>& b)
{
    const size_t length = a.size() < b.size() ? a.size() : b.size();

    float result = 0;
    for (size_t i = 0; i < length; i++)
    {
        if (a[i] == 0.f)
            continue;

        result += a[i] * log(a[i] / b[i]);
    }

    return result;
}
253 
quantize_KL()254 int QuantNet::quantize_KL()
255 {
256     const int input_blob_count = (int)input_blobs.size();
257     const int conv_layer_count = (int)conv_layers.size();
258     const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
259     const int image_count = (int)listspaths[0].size();
260 
261     const int num_histogram_bins = 2048;
262 
263     std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
264     std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
265 
266     // initialize conv weight scales
267     #pragma omp parallel for num_threads(quantize_num_threads)
268     for (int i = 0; i < conv_layer_count; i++)
269     {
270         const ncnn::Layer* layer = layers[conv_layers[i]];
271 
272         if (layer->type == "Convolution")
273         {
274             const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
275 
276             const int num_output = convolution->num_output;
277             const int kernel_w = convolution->kernel_w;
278             const int kernel_h = convolution->kernel_h;
279             const int dilation_w = convolution->dilation_w;
280             const int dilation_h = convolution->dilation_h;
281             const int stride_w = convolution->stride_w;
282             const int stride_h = convolution->stride_h;
283 
284             const int weight_data_size_output = convolution->weight_data_size / num_output;
285 
286             // int8 winograd F43 needs weight data to use 6bit quantization
287             // TODO proper condition for winograd 3x3 int8
288             bool quant_6bit = false;
289             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
290                 quant_6bit = true;
291 
292             weight_scales[i].create(num_output);
293 
294             for (int n = 0; n < num_output; n++)
295             {
296                 const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
297 
298                 float absmax = 0.f;
299                 for (int k = 0; k < weight_data_size_output; k++)
300                 {
301                     absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
302                 }
303 
304                 if (quant_6bit)
305                 {
306                     weight_scales[i][n] = 31 / absmax;
307                 }
308                 else
309                 {
310                     weight_scales[i][n] = 127 / absmax;
311                 }
312             }
313         }
314 
315         if (layer->type == "ConvolutionDepthWise")
316         {
317             const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
318 
319             const int group = convolutiondepthwise->group;
320             const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
321 
322             std::vector<float> scales;
323 
324             weight_scales[i].create(group);
325 
326             for (int n = 0; n < group; n++)
327             {
328                 const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
329 
330                 float absmax = 0.f;
331                 for (int k = 0; k < weight_data_size_output; k++)
332                 {
333                     absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
334                 }
335 
336                 weight_scales[i][n] = 127 / absmax;
337             }
338         }
339 
340         if (layer->type == "InnerProduct")
341         {
342             const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
343 
344             const int num_output = innerproduct->num_output;
345             const int weight_data_size_output = innerproduct->weight_data_size / num_output;
346 
347             weight_scales[i].create(num_output);
348 
349             for (int n = 0; n < num_output; n++)
350             {
351                 const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
352 
353                 float absmax = 0.f;
354                 for (int k = 0; k < weight_data_size_output; k++)
355                 {
356                     absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
357                 }
358 
359                 weight_scales[i][n] = 127 / absmax;
360             }
361         }
362     }
363 
364     // count the absmax
365     #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
366     for (int i = 0; i < image_count; i++)
367     {
368         if (i % 100 == 0)
369         {
370             fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
371         }
372 
373         ncnn::Extractor ex = create_extractor();
374 
375         const int thread_num = ncnn::get_omp_thread_num();
376         ex.set_blob_allocator(&blob_allocators[thread_num]);
377         ex.set_workspace_allocator(&workspace_allocators[thread_num]);
378 
379         for (int j = 0; j < input_blob_count; j++)
380         {
381             const int type_to_pixel = type_to_pixels[j];
382             const std::vector<float>& mean_vals = means[j];
383             const std::vector<float>& norm_vals = norms[j];
384 
385             int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
386             if (type_to_pixel != pixel_convert_type)
387             {
388                 pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
389             }
390 
391             ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
392 
393             in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
394 
395             ex.input(input_blobs[j], in);
396         }
397 
398         for (int j = 0; j < conv_bottom_blob_count; j++)
399         {
400             ncnn::Mat out;
401             ex.extract(conv_bottom_blobs[j], out);
402 
403             // count absmax
404             {
405                 float absmax = 0.f;
406 
407                 const int outc = out.c;
408                 const int outsize = out.w * out.h;
409                 for (int p = 0; p < outc; p++)
410                 {
411                     const float* ptr = out.channel(p);
412                     for (int k = 0; k < outsize; k++)
413                     {
414                         absmax = std::max(absmax, (float)fabs(ptr[k]));
415                     }
416                 }
417 
418                 #pragma omp critical
419                 {
420                     QuantBlobStat& stat = quant_blob_stats[j];
421                     stat.absmax = std::max(stat.absmax, absmax);
422                 }
423             }
424         }
425     }
426 
427     // initialize histogram
428     #pragma omp parallel for num_threads(quantize_num_threads)
429     for (int i = 0; i < conv_bottom_blob_count; i++)
430     {
431         QuantBlobStat& stat = quant_blob_stats[i];
432 
433         stat.histogram.resize(num_histogram_bins, 0);
434         stat.histogram_normed.resize(num_histogram_bins, 0);
435     }
436 
437     // build histogram
438     #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
439     for (int i = 0; i < image_count; i++)
440     {
441         if (i % 100 == 0)
442         {
443             fprintf(stderr, "build histogram %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
444         }
445 
446         ncnn::Extractor ex = create_extractor();
447 
448         const int thread_num = ncnn::get_omp_thread_num();
449         ex.set_blob_allocator(&blob_allocators[thread_num]);
450         ex.set_workspace_allocator(&workspace_allocators[thread_num]);
451 
452         for (int j = 0; j < input_blob_count; j++)
453         {
454             const int type_to_pixel = type_to_pixels[j];
455             const std::vector<float>& mean_vals = means[j];
456             const std::vector<float>& norm_vals = norms[j];
457 
458             int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
459             if (type_to_pixel != pixel_convert_type)
460             {
461                 pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
462             }
463 
464             ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
465 
466             in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
467 
468             ex.input(input_blobs[j], in);
469         }
470 
471         for (int j = 0; j < conv_bottom_blob_count; j++)
472         {
473             ncnn::Mat out;
474             ex.extract(conv_bottom_blobs[j], out);
475 
476             // count histogram bin
477             {
478                 const float absmax = quant_blob_stats[j].absmax;
479 
480                 std::vector<uint64_t> histogram(num_histogram_bins, 0);
481 
482                 const int outc = out.c;
483                 const int outsize = out.w * out.h;
484                 for (int p = 0; p < outc; p++)
485                 {
486                     const float* ptr = out.channel(p);
487                     for (int k = 0; k < outsize; k++)
488                     {
489                         if (ptr[k] == 0.f)
490                             continue;
491 
492                         const int index = std::min((int)(fabs(ptr[k]) / absmax * num_histogram_bins), (num_histogram_bins - 1));
493 
494                         histogram[index] += 1;
495                     }
496                 }
497 
498                 #pragma omp critical
499                 {
500                     QuantBlobStat& stat = quant_blob_stats[j];
501 
502                     for (int k = 0; k < num_histogram_bins; k++)
503                     {
504                         stat.histogram[k] += histogram[k];
505                     }
506                 }
507             }
508         }
509     }
510 
511     // using kld to find the best threshold value
512     #pragma omp parallel for num_threads(quantize_num_threads)
513     for (int i = 0; i < conv_bottom_blob_count; i++)
514     {
515         QuantBlobStat& stat = quant_blob_stats[i];
516 
517         // normalize histogram bin
518         {
519             uint64_t sum = 0;
520             for (int j = 0; j < num_histogram_bins; j++)
521             {
522                 sum += stat.histogram[j];
523             }
524 
525             for (int j = 0; j < num_histogram_bins; j++)
526             {
527                 stat.histogram_normed[j] = (float)(stat.histogram[j] / (double)sum);
528             }
529         }
530 
531         const int target_bin = 128;
532 
533         int target_threshold = target_bin;
534         float min_kl_divergence = FLT_MAX;
535 
536         for (int threshold = target_bin; threshold < num_histogram_bins; threshold++)
537         {
538             const float kl_eps = 0.0001f;
539 
540             std::vector<float> clip_distribution(threshold, kl_eps);
541             {
542                 for (int j = 0; j < threshold; j++)
543                 {
544                     clip_distribution[j] += stat.histogram_normed[j];
545                 }
546                 for (int j = threshold; j < num_histogram_bins; j++)
547                 {
548                     clip_distribution[threshold - 1] += stat.histogram_normed[j];
549                 }
550             }
551 
552             const float num_per_bin = (float)threshold / target_bin;
553 
554             std::vector<float> quantize_distribution(target_bin, 0.f);
555             {
556                 {
557                     const float end = num_per_bin;
558 
559                     const int right_lower = (int)floor(end);
560                     const float right_scale = end - right_lower;
561 
562                     if (right_scale > 0)
563                     {
564                         quantize_distribution[0] += right_scale * stat.histogram_normed[right_lower];
565                     }
566 
567                     for (int k = 0; k < right_lower; k++)
568                     {
569                         quantize_distribution[0] += stat.histogram_normed[k];
570                     }
571 
572                     quantize_distribution[0] /= right_lower + right_scale;
573                 }
574                 for (int j = 1; j < target_bin - 1; j++)
575                 {
576                     const float start = j * num_per_bin;
577                     const float end = (j + 1) * num_per_bin;
578 
579                     const int left_upper = (int)ceil(start);
580                     const float left_scale = left_upper - start;
581 
582                     const int right_lower = (int)floor(end);
583                     const float right_scale = end - right_lower;
584 
585                     if (left_scale > 0)
586                     {
587                         quantize_distribution[j] += left_scale * stat.histogram_normed[left_upper - 1];
588                     }
589 
590                     if (right_scale > 0)
591                     {
592                         quantize_distribution[j] += right_scale * stat.histogram_normed[right_lower];
593                     }
594 
595                     for (int k = left_upper; k < right_lower; k++)
596                     {
597                         quantize_distribution[j] += stat.histogram_normed[k];
598                     }
599 
600                     quantize_distribution[j] /= right_lower - left_upper + left_scale + right_scale;
601                 }
602                 {
603                     const float start = threshold - num_per_bin;
604 
605                     const int left_upper = (int)ceil(start);
606                     const float left_scale = left_upper - start;
607 
608                     if (left_scale > 0)
609                     {
610                         quantize_distribution[target_bin - 1] += left_scale * stat.histogram_normed[left_upper - 1];
611                     }
612 
613                     for (int k = left_upper; k < threshold; k++)
614                     {
615                         quantize_distribution[target_bin - 1] += stat.histogram_normed[k];
616                     }
617 
618                     quantize_distribution[target_bin - 1] /= threshold - left_upper + left_scale;
619                 }
620             }
621 
622             std::vector<float> expand_distribution(threshold, kl_eps);
623             {
624                 {
625                     const float end = num_per_bin;
626 
627                     const int right_lower = (int)floor(end);
628                     const float right_scale = end - right_lower;
629 
630                     if (right_scale > 0)
631                     {
632                         expand_distribution[right_lower] += right_scale * quantize_distribution[0];
633                     }
634 
635                     for (int k = 0; k < right_lower; k++)
636                     {
637                         expand_distribution[k] += quantize_distribution[0];
638                     }
639                 }
640                 for (int j = 1; j < target_bin - 1; j++)
641                 {
642                     const float start = j * num_per_bin;
643                     const float end = (j + 1) * num_per_bin;
644 
645                     const int left_upper = (int)ceil(start);
646                     const float left_scale = left_upper - start;
647 
648                     const int right_lower = (int)floor(end);
649                     const float right_scale = end - right_lower;
650 
651                     if (left_scale > 0)
652                     {
653                         expand_distribution[left_upper - 1] += left_scale * quantize_distribution[j];
654                     }
655 
656                     if (right_scale > 0)
657                     {
658                         expand_distribution[right_lower] += right_scale * quantize_distribution[j];
659                     }
660 
661                     for (int k = left_upper; k < right_lower; k++)
662                     {
663                         expand_distribution[k] += quantize_distribution[j];
664                     }
665                 }
666                 {
667                     const float start = threshold - num_per_bin;
668 
669                     const int left_upper = (int)ceil(start);
670                     const float left_scale = left_upper - start;
671 
672                     if (left_scale > 0)
673                     {
674                         expand_distribution[left_upper - 1] += left_scale * quantize_distribution[target_bin - 1];
675                     }
676 
677                     for (int k = left_upper; k < threshold; k++)
678                     {
679                         expand_distribution[k] += quantize_distribution[target_bin - 1];
680                     }
681                 }
682             }
683 
684             // kl
685             const float kl_divergence = compute_kl_divergence(clip_distribution, expand_distribution);
686 
687             // the best num of bin
688             if (kl_divergence < min_kl_divergence)
689             {
690                 min_kl_divergence = kl_divergence;
691                 target_threshold = threshold;
692             }
693         }
694 
695         stat.threshold = (target_threshold + 0.5f) * stat.absmax / num_histogram_bins;
696         float scale = 127 / stat.threshold;
697 
698         bottom_blob_scales[i].create(1);
699         bottom_blob_scales[i][0] = scale;
700     }
701 
702     return 0;
703 }
704 
// ACIQ clipping threshold under a gaussian assumption.
//
// The standard deviation is estimated from the observed absmax and the sample
// count N, then multiplied by the alpha coefficient tabulated for num_bits.
//
// absmax    largest absolute value of the data
// N         number of values absmax was taken from
// num_bits  quantization bit width; values outside 1..8 are clamped to that range
//
// With N <= 1 the estimate sqrt(2 * log(N)) is zero or undefined, which would
// yield an inf / NaN threshold; absmax itself is returned in that case
// (i.e. no clipping).
static float compute_aciq_gaussian_clip(float absmax, int N, int num_bits = 8)
{
    static const float alpha_gaussian[8] = {0, 1.71063519, 2.15159277, 2.55913646, 2.93620062, 3.28691474, 3.6151146, 3.92403714};

    if (N <= 1)
        return absmax;

    // keep the table lookup in bounds
    if (num_bits < 1)
        num_bits = 1;
    if (num_bits > 8)
        num_bits = 8;

    const double gaussian_const = (0.5 * 0.35) * (1 + sqrt(3.14159265358979323846 * log(4)));

    const double sigma = (absmax * 2 * gaussian_const) / sqrt(2 * log(N));

    return (float)(alpha_gaussian[num_bits - 1] * sigma);
}
715 
quantize_ACIQ()716 int QuantNet::quantize_ACIQ()
717 {
718     const int input_blob_count = (int)input_blobs.size();
719     const int conv_layer_count = (int)conv_layers.size();
720     const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
721     const int image_count = (int)listspaths[0].size();
722 
723     std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
724     std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
725 
726     // initialize conv weight scales
727     #pragma omp parallel for num_threads(quantize_num_threads)
728     for (int i = 0; i < conv_layer_count; i++)
729     {
730         const ncnn::Layer* layer = layers[conv_layers[i]];
731 
732         if (layer->type == "Convolution")
733         {
734             const ncnn::Convolution* convolution = (const ncnn::Convolution*)layer;
735 
736             const int num_output = convolution->num_output;
737             const int kernel_w = convolution->kernel_w;
738             const int kernel_h = convolution->kernel_h;
739             const int dilation_w = convolution->dilation_w;
740             const int dilation_h = convolution->dilation_h;
741             const int stride_w = convolution->stride_w;
742             const int stride_h = convolution->stride_h;
743 
744             const int weight_data_size_output = convolution->weight_data_size / num_output;
745 
746             // int8 winograd F43 needs weight data to use 6bit quantization
747             // TODO proper condition for winograd 3x3 int8
748             bool quant_6bit = false;
749             if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
750                 quant_6bit = true;
751 
752             weight_scales[i].create(num_output);
753 
754             for (int n = 0; n < num_output; n++)
755             {
756                 const ncnn::Mat weight_data_n = convolution->weight_data.range(weight_data_size_output * n, weight_data_size_output);
757 
758                 float absmax = 0.f;
759                 for (int k = 0; k < weight_data_size_output; k++)
760                 {
761                     absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
762                 }
763 
764                 if (quant_6bit)
765                 {
766                     const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output, 6);
767                     weight_scales[i][n] = 31 / threshold;
768                 }
769                 else
770                 {
771                     const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
772                     weight_scales[i][n] = 127 / threshold;
773                 }
774             }
775         }
776 
777         if (layer->type == "ConvolutionDepthWise")
778         {
779             const ncnn::ConvolutionDepthWise* convolutiondepthwise = (const ncnn::ConvolutionDepthWise*)layer;
780 
781             const int group = convolutiondepthwise->group;
782             const int weight_data_size_output = convolutiondepthwise->weight_data_size / group;
783 
784             std::vector<float> scales;
785 
786             weight_scales[i].create(group);
787 
788             for (int n = 0; n < group; n++)
789             {
790                 const ncnn::Mat weight_data_n = convolutiondepthwise->weight_data.range(weight_data_size_output * n, weight_data_size_output);
791 
792                 float absmax = 0.f;
793                 for (int k = 0; k < weight_data_size_output; k++)
794                 {
795                     absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
796                 }
797 
798                 const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
799                 weight_scales[i][n] = 127 / threshold;
800             }
801         }
802 
803         if (layer->type == "InnerProduct")
804         {
805             const ncnn::InnerProduct* innerproduct = (const ncnn::InnerProduct*)layer;
806 
807             const int num_output = innerproduct->num_output;
808             const int weight_data_size_output = innerproduct->weight_data_size / num_output;
809 
810             weight_scales[i].create(num_output);
811 
812             for (int n = 0; n < num_output; n++)
813             {
814                 const ncnn::Mat weight_data_n = innerproduct->weight_data.range(weight_data_size_output * n, weight_data_size_output);
815 
816                 float absmax = 0.f;
817                 for (int k = 0; k < weight_data_size_output; k++)
818                 {
819                     absmax = std::max(absmax, (float)fabs(weight_data_n[k]));
820                 }
821 
822                 const float threshold = compute_aciq_gaussian_clip(absmax, weight_data_size_output);
823                 weight_scales[i][n] = 127 / threshold;
824             }
825         }
826     }
827 
828     // count the absmax
829     #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
830     for (int i = 0; i < image_count; i++)
831     {
832         if (i % 100 == 0)
833         {
834             fprintf(stderr, "count the absmax %.2f%% [ %d / %d ]\n", i * 100.f / image_count, i, image_count);
835         }
836 
837         ncnn::Extractor ex = create_extractor();
838 
839         const int thread_num = ncnn::get_omp_thread_num();
840         ex.set_blob_allocator(&blob_allocators[thread_num]);
841         ex.set_workspace_allocator(&workspace_allocators[thread_num]);
842 
843         for (int j = 0; j < input_blob_count; j++)
844         {
845             const int type_to_pixel = type_to_pixels[j];
846             const std::vector<float>& mean_vals = means[j];
847             const std::vector<float>& norm_vals = norms[j];
848 
849             int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
850             if (type_to_pixel != pixel_convert_type)
851             {
852                 pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
853             }
854 
855             ncnn::Mat in = read_and_resize_image(shapes[j], listspaths[j][i], pixel_convert_type);
856 
857             in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
858 
859             ex.input(input_blobs[j], in);
860         }
861 
862         for (int j = 0; j < conv_bottom_blob_count; j++)
863         {
864             ncnn::Mat out;
865             ex.extract(conv_bottom_blobs[j], out);
866 
867             // count absmax
868             {
869                 float absmax = 0.f;
870 
871                 const int outc = out.c;
872                 const int outsize = out.w * out.h;
873                 for (int p = 0; p < outc; p++)
874                 {
875                     const float* ptr = out.channel(p);
876                     for (int k = 0; k < outsize; k++)
877                     {
878                         absmax = std::max(absmax, (float)fabs(ptr[k]));
879                     }
880                 }
881 
882                 #pragma omp critical
883                 {
884                     QuantBlobStat& stat = quant_blob_stats[j];
885                     stat.absmax = std::max(stat.absmax, absmax);
886                     stat.total = outc * outsize;
887                 }
888             }
889         }
890     }
891 
892     // alpha gaussian
893     #pragma omp parallel for num_threads(quantize_num_threads)
894     for (int i = 0; i < conv_bottom_blob_count; i++)
895     {
896         QuantBlobStat& stat = quant_blob_stats[i];
897 
898         stat.threshold = compute_aciq_gaussian_clip(stat.absmax, stat.total);
899         float scale = 127 / stat.threshold;
900 
901         bottom_blob_scales[i].create(1);
902         bottom_blob_scales[i][0] = scale;
903     }
904 
905     return 0;
906 }
907 
cosine_similarity(const ncnn::Mat & a,const ncnn::Mat & b)908 static float cosine_similarity(const ncnn::Mat& a, const ncnn::Mat& b)
909 {
910     const int chanenls = a.c;
911     const int size = a.w * a.h;
912 
913     float sa = 0;
914     float sb = 0;
915     float sum = 0;
916 
917     for (int p = 0; p < chanenls; p++)
918     {
919         const float* pa = a.channel(p);
920         const float* pb = b.channel(p);
921 
922         for (int i = 0; i < size; i++)
923         {
924             sa += pa[i] * pa[i];
925             sb += pb[i] * pb[i];
926             sum += pa[i] * pb[i];
927         }
928     }
929 
930     float sim = (float)sum / sqrt(sa) / sqrt(sb);
931 
932     return sim;
933 }
934 
get_layer_param(const ncnn::Layer * layer,ncnn::ParamDict & pd)935 static int get_layer_param(const ncnn::Layer* layer, ncnn::ParamDict& pd)
936 {
937     if (layer->type == "Convolution")
938     {
939         ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
940 
941         pd.set(0, convolution->num_output);
942         pd.set(1, convolution->kernel_w);
943         pd.set(11, convolution->kernel_h);
944         pd.set(2, convolution->dilation_w);
945         pd.set(12, convolution->dilation_h);
946         pd.set(3, convolution->stride_w);
947         pd.set(13, convolution->stride_h);
948         pd.set(4, convolution->pad_left);
949         pd.set(15, convolution->pad_right);
950         pd.set(14, convolution->pad_top);
951         pd.set(16, convolution->pad_bottom);
952         pd.set(18, convolution->pad_value);
953         pd.set(5, convolution->bias_term);
954         pd.set(6, convolution->weight_data_size);
955         pd.set(8, convolution->int8_scale_term);
956         pd.set(9, convolution->activation_type);
957         pd.set(10, convolution->activation_params);
958     }
959     else if (layer->type == "ConvolutionDepthWise")
960     {
961         ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
962 
963         pd.set(0, convolutiondepthwise->num_output);
964         pd.set(1, convolutiondepthwise->kernel_w);
965         pd.set(11, convolutiondepthwise->kernel_h);
966         pd.set(2, convolutiondepthwise->dilation_w);
967         pd.set(12, convolutiondepthwise->dilation_h);
968         pd.set(3, convolutiondepthwise->stride_w);
969         pd.set(13, convolutiondepthwise->stride_h);
970         pd.set(4, convolutiondepthwise->pad_left);
971         pd.set(15, convolutiondepthwise->pad_right);
972         pd.set(14, convolutiondepthwise->pad_top);
973         pd.set(16, convolutiondepthwise->pad_bottom);
974         pd.set(18, convolutiondepthwise->pad_value);
975         pd.set(5, convolutiondepthwise->bias_term);
976         pd.set(6, convolutiondepthwise->weight_data_size);
977         pd.set(7, convolutiondepthwise->group);
978         pd.set(8, convolutiondepthwise->int8_scale_term);
979         pd.set(9, convolutiondepthwise->activation_type);
980         pd.set(10, convolutiondepthwise->activation_params);
981     }
982     else if (layer->type == "InnerProduct")
983     {
984         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
985 
986         pd.set(0, innerproduct->num_output);
987         pd.set(1, innerproduct->bias_term);
988         pd.set(2, innerproduct->weight_data_size);
989         pd.set(8, innerproduct->int8_scale_term);
990         pd.set(9, innerproduct->activation_type);
991         pd.set(10, innerproduct->activation_params);
992     }
993     else
994     {
995         fprintf(stderr, "unexpected layer type %s in get_layer_param\n", layer->type.c_str());
996         return -1;
997     }
998 
999     return 0;
1000 }
1001 
get_layer_weights(const ncnn::Layer * layer,std::vector<ncnn::Mat> & weights)1002 static int get_layer_weights(const ncnn::Layer* layer, std::vector<ncnn::Mat>& weights)
1003 {
1004     if (layer->type == "Convolution")
1005     {
1006         ncnn::Convolution* convolution = (ncnn::Convolution*)layer;
1007         weights.push_back(convolution->weight_data);
1008         if (convolution->bias_term)
1009             weights.push_back(convolution->bias_data);
1010     }
1011     else if (layer->type == "ConvolutionDepthWise")
1012     {
1013         ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layer;
1014         weights.push_back(convolutiondepthwise->weight_data);
1015         if (convolutiondepthwise->bias_term)
1016             weights.push_back(convolutiondepthwise->bias_data);
1017     }
1018     else if (layer->type == "InnerProduct")
1019     {
1020         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layer;
1021         weights.push_back(innerproduct->weight_data);
1022         if (innerproduct->bias_term)
1023             weights.push_back(innerproduct->bias_data);
1024     }
1025     else
1026     {
1027         fprintf(stderr, "unexpected layer type %s in get_layer_weights\n", layer->type.c_str());
1028         return -1;
1029     }
1030 
1031     return 0;
1032 }
1033 
quantize_EQ()1034 int QuantNet::quantize_EQ()
1035 {
1036     // find the initial scale via KL
1037     quantize_KL();
1038 
1039     print_quant_info();
1040 
1041     const int input_blob_count = (int)input_blobs.size();
1042     const int conv_layer_count = (int)conv_layers.size();
1043     const int conv_bottom_blob_count = (int)conv_bottom_blobs.size();
1044 
1045     std::vector<ncnn::UnlockedPoolAllocator> blob_allocators(quantize_num_threads);
1046     std::vector<ncnn::UnlockedPoolAllocator> workspace_allocators(quantize_num_threads);
1047 
1048     // max 50 images for EQ
1049     const int image_count = std::min((int)listspaths[0].size(), 50);
1050 
1051     const float scale_range_lower = 0.5f;
1052     const float scale_range_upper = 2.0f;
1053     const int search_steps = 100;
1054 
1055     for (int i = 0; i < conv_layer_count; i++)
1056     {
1057         ncnn::Mat& weight_scale = weight_scales[i];
1058         ncnn::Mat& bottom_blob_scale = bottom_blob_scales[i];
1059 
1060         const ncnn::Layer* layer = layers[conv_layers[i]];
1061 
1062         // search weight scale
1063         for (int j = 0; j < weight_scale.w; j++)
1064         {
1065             const float scale = weight_scale[j];
1066             const float scale_lower = scale * scale_range_lower;
1067             const float scale_upper = scale * scale_range_upper;
1068             const float scale_step = (scale_upper - scale_lower) / search_steps;
1069 
1070             std::vector<double> avgsims(search_steps, 0.0);
1071 
1072             #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
1073             for (int ii = 0; ii < image_count; ii++)
1074             {
1075                 if (ii % 100 == 0)
1076                 {
1077                     fprintf(stderr, "search weight scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, weight_scale.w, i, conv_layer_count);
1078                 }
1079 
1080                 ncnn::Extractor ex = create_extractor();
1081 
1082                 const int thread_num = ncnn::get_omp_thread_num();
1083                 ex.set_blob_allocator(&blob_allocators[thread_num]);
1084                 ex.set_workspace_allocator(&workspace_allocators[thread_num]);
1085 
1086                 for (int jj = 0; jj < input_blob_count; jj++)
1087                 {
1088                     const int type_to_pixel = type_to_pixels[jj];
1089                     const std::vector<float>& mean_vals = means[jj];
1090                     const std::vector<float>& norm_vals = norms[jj];
1091 
1092                     int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
1093                     if (type_to_pixel != pixel_convert_type)
1094                     {
1095                         pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
1096                     }
1097 
1098                     ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);
1099 
1100                     in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
1101 
1102                     ex.input(input_blobs[jj], in);
1103                 }
1104 
1105                 ncnn::Mat in;
1106                 ex.extract(conv_bottom_blobs[i], in);
1107 
1108                 ncnn::Mat out;
1109                 ex.extract(conv_top_blobs[i], out);
1110 
1111                 ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
1112 
1113                 ncnn::ParamDict pd;
1114                 get_layer_param(layer, pd);
1115                 pd.set(8, 1); //int8_scale_term
1116                 layer_int8->load_param(pd);
1117 
1118                 std::vector<float> sims(search_steps);
1119                 for (int k = 0; k < search_steps; k++)
1120                 {
1121                     ncnn::Mat new_weight_scale = weight_scale.clone();
1122                     new_weight_scale[j] = scale_lower + k * scale_step;
1123 
1124                     std::vector<ncnn::Mat> weights;
1125                     get_layer_weights(layer, weights);
1126                     weights.push_back(new_weight_scale);
1127                     weights.push_back(bottom_blob_scale);
1128                     layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
1129 
1130                     ncnn::Option opt_int8;
1131                     opt_int8.use_packing_layout = false;
1132 
1133                     layer_int8->create_pipeline(opt_int8);
1134 
1135                     ncnn::Mat out_int8;
1136                     layer_int8->forward(in, out_int8, opt_int8);
1137 
1138                     layer_int8->destroy_pipeline(opt_int8);
1139 
1140                     sims[k] = cosine_similarity(out, out_int8);
1141                 }
1142 
1143                 delete layer_int8;
1144 
1145                 #pragma omp critical
1146                 {
1147                     for (int k = 0; k < search_steps; k++)
1148                     {
1149                         avgsims[k] += sims[k];
1150                     }
1151                 }
1152             }
1153 
1154             double max_avgsim = 0.0;
1155             float new_scale = scale;
1156 
1157             // find the scale with min cosine distance
1158             for (int k = 0; k < search_steps; k++)
1159             {
1160                 if (max_avgsim < avgsims[k])
1161                 {
1162                     max_avgsim = avgsims[k];
1163                     new_scale = scale_lower + k * scale_step;
1164                 }
1165             }
1166 
1167             fprintf(stderr, "%s w %d  = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
1168             weight_scale[j] = new_scale;
1169         }
1170 
1171         // search bottom blob scale
1172         for (int j = 0; j < bottom_blob_scale.w; j++)
1173         {
1174             const float scale = bottom_blob_scale[j];
1175             const float scale_lower = scale * scale_range_lower;
1176             const float scale_upper = scale * scale_range_upper;
1177             const float scale_step = (scale_upper - scale_lower) / search_steps;
1178 
1179             std::vector<double> avgsims(search_steps, 0.0);
1180 
1181             #pragma omp parallel for num_threads(quantize_num_threads) schedule(static, 1)
1182             for (int ii = 0; ii < image_count; ii++)
1183             {
1184                 if (ii % 100 == 0)
1185                 {
1186                     fprintf(stderr, "search bottom blob scale %.2f%% [ %d / %d ] for %d / %d of %d / %d\n", ii * 100.f / image_count, ii, image_count, j, bottom_blob_scale.w, i, conv_layer_count);
1187                 }
1188 
1189                 ncnn::Extractor ex = create_extractor();
1190 
1191                 const int thread_num = ncnn::get_omp_thread_num();
1192                 ex.set_blob_allocator(&blob_allocators[thread_num]);
1193                 ex.set_workspace_allocator(&workspace_allocators[thread_num]);
1194 
1195                 for (int jj = 0; jj < input_blob_count; jj++)
1196                 {
1197                     const int type_to_pixel = type_to_pixels[jj];
1198                     const std::vector<float>& mean_vals = means[jj];
1199                     const std::vector<float>& norm_vals = norms[jj];
1200 
1201                     int pixel_convert_type = ncnn::Mat::PIXEL_BGR;
1202                     if (type_to_pixel != pixel_convert_type)
1203                     {
1204                         pixel_convert_type = pixel_convert_type | (type_to_pixel << ncnn::Mat::PIXEL_CONVERT_SHIFT);
1205                     }
1206 
1207                     ncnn::Mat in = read_and_resize_image(shapes[jj], listspaths[jj][ii], pixel_convert_type);
1208 
1209                     in.substract_mean_normalize(mean_vals.data(), norm_vals.data());
1210 
1211                     ex.input(input_blobs[jj], in);
1212                 }
1213 
1214                 ncnn::Mat in;
1215                 ex.extract(conv_bottom_blobs[i], in);
1216 
1217                 ncnn::Mat out;
1218                 ex.extract(conv_top_blobs[i], out);
1219 
1220                 ncnn::Layer* layer_int8 = ncnn::create_layer(layer->typeindex);
1221 
1222                 ncnn::ParamDict pd;
1223                 get_layer_param(layer, pd);
1224                 pd.set(8, 1); //int8_scale_term
1225                 layer_int8->load_param(pd);
1226 
1227                 std::vector<float> sims(search_steps);
1228                 for (int k = 0; k < search_steps; k++)
1229                 {
1230                     ncnn::Mat new_bottom_blob_scale = bottom_blob_scale.clone();
1231                     new_bottom_blob_scale[j] = scale_lower + k * scale_step;
1232 
1233                     std::vector<ncnn::Mat> weights;
1234                     get_layer_weights(layer, weights);
1235                     weights.push_back(weight_scale);
1236                     weights.push_back(new_bottom_blob_scale);
1237                     layer_int8->load_model(ncnn::ModelBinFromMatArray(weights.data()));
1238 
1239                     ncnn::Option opt_int8;
1240                     opt_int8.use_packing_layout = false;
1241 
1242                     layer_int8->create_pipeline(opt_int8);
1243 
1244                     ncnn::Mat out_int8;
1245                     layer_int8->forward(in, out_int8, opt_int8);
1246 
1247                     layer_int8->destroy_pipeline(opt_int8);
1248 
1249                     sims[k] = cosine_similarity(out, out_int8);
1250                 }
1251 
1252                 delete layer_int8;
1253 
1254                 #pragma omp critical
1255                 {
1256                     for (int k = 0; k < search_steps; k++)
1257                     {
1258                         avgsims[k] += sims[k];
1259                     }
1260                 }
1261             }
1262 
1263             double max_avgsim = 0.0;
1264             float new_scale = scale;
1265 
1266             // find the scale with min cosine distance
1267             for (int k = 0; k < search_steps; k++)
1268             {
1269                 if (max_avgsim < avgsims[k])
1270                 {
1271                     max_avgsim = avgsims[k];
1272                     new_scale = scale_lower + k * scale_step;
1273                 }
1274             }
1275 
1276             fprintf(stderr, "%s b %d  = %f -> %f\n", layer->name.c_str(), j, scale, new_scale);
1277             bottom_blob_scale[j] = new_scale;
1278         }
1279 
1280         // update quant info
1281         QuantBlobStat& stat = quant_blob_stats[i];
1282         stat.threshold = 127 / bottom_blob_scale[0];
1283     }
1284 
1285     return 0;
1286 }
1287 
// Load one list of file paths per comma separated list file named in s.
//
// s is split in place with strtok. Each list file is read line by line and
// the first whitespace-delimited token of every non-blank line is taken as a
// path; anything after that token on the line is ignored. Lines and paths
// may be of any length.
//
// Returns one vector of paths per list file, in the order given. If a list
// file cannot be opened an error is printed and parsing stops, so the result
// then holds only the lists read so far.
static std::vector<std::vector<std::string> > parse_comma_path_list(char* s)
{
    std::vector<std::vector<std::string> > aps;

    // the characters sscanf("%s") treats as whitespace in the "C" locale
    static const char* const blanks = " \t\n\v\f\r";

    for (char* pch = strtok(s, ","); pch != NULL; pch = strtok(NULL, ","))
    {
        FILE* fp = fopen(pch, "rb");
        if (!fp)
        {
            fprintf(stderr, "fopen %s failed\n", pch);
            break;
        }

        std::vector<std::string> paths;

        // one filepath per line
        std::string line;
        char chunk[1024];
        for (;;)
        {
            const bool got_chunk = fgets(chunk, sizeof(chunk), fp) != NULL;
            if (got_chunk)
            {
                line += chunk;

                // fgets stopped because the buffer was full, not because the
                // line ended: keep appending until the newline shows up
                if (!line.empty() && line[line.size() - 1] != '\n')
                    continue;
            }

            // here line is either a complete line or the unterminated tail
            // of the file
            const std::string::size_type first = line.find_first_not_of(blanks);
            if (first != std::string::npos)
            {
                const std::string::size_type last = line.find_first_of(blanks, first);
                if (last == std::string::npos)
                    paths.push_back(line.substr(first));
                else
                    paths.push_back(line.substr(first, last - first));
            }
            line.clear();

            if (!got_chunk)
                break;
        }

        fclose(fp);

        aps.push_back(paths);
    }

    return aps;
}
1329 
// Parse a decimal floating point literal of the form
//   [+-]digits[.digits][(e|E)[+-]digits]
// by hand, without calling any library number parser or character
// classification routine.
//
// Parsing stops at the first character that does not fit the grammar and
// whatever was recognised up to that point is returned (0 for a string with
// no leading number). The integer and fraction digits are accumulated in
// 64-bit integers, which is sufficient for the at most 19 characters the
// callers in this file pass in.
static float vstr_to_float(const char vstr[20])
{
    const char* p = vstr;

    // sign
    const bool negative = (*p == '-');
    if (*p == '+' || *p == '-')
    {
        p++;
    }

    // digits before decimal point or exponent
    uint64_t int_part = 0;
    for (; *p >= '0' && *p <= '9'; p++)
    {
        int_part = int_part * 10 + (*p - '0');
    }

    double v = (double)int_part;

    // digits after decimal point
    if (*p == '.')
    {
        p++;

        uint64_t pow10 = 1;
        uint64_t frac_part = 0;

        for (; *p >= '0' && *p <= '9'; p++)
        {
            frac_part = frac_part * 10 + (*p - '0');
            pow10 *= 10;
        }

        v += frac_part / (double)pow10;
    }

    // exponent
    if (*p == 'e' || *p == 'E')
    {
        p++;

        // sign of exponent
        const bool negative_exponent = (*p == '-');
        if (*p == '+' || *p == '-')
        {
            p++;
        }

        // digits of exponent
        // Accumulation stops once the value reaches expon_cap: any exponent
        // that large already drives scale to infinity, so the result is the
        // same while the loops below stay short for absurd exponents.
        const uint64_t expon_cap = 10000;
        uint64_t expon = 0;
        for (; *p >= '0' && *p <= '9'; p++)
        {
            if (expon < expon_cap)
                expon = expon * 10 + (*p - '0');
        }

        // build 10^expon, eight decimal orders at a time
        double scale = 1.0;
        while (expon >= 8)
        {
            scale *= 1e8;
            expon -= 8;
        }
        while (expon > 0)
        {
            scale *= 10.0;
            expon -= 1;
        }

        v = negative_exponent ? v / scale : v * scale;
    }

    return negative ? (float)-v : (float)v;
}
1409 
parse_comma_float_array_list(char * s)1410 static std::vector<std::vector<float> > parse_comma_float_array_list(char* s)
1411 {
1412     std::vector<std::vector<float> > aaf;
1413 
1414     char* pch = strtok(s, "[]");
1415     while (pch != NULL)
1416     {
1417         // parse a,b,c
1418         char vstr[20];
1419         int nconsumed = 0;
1420         int nscan = sscanf(pch, "%19[^,]%n", vstr, &nconsumed);
1421         if (nscan == 1)
1422         {
1423             // ok we get array
1424             pch += nconsumed;
1425 
1426             std::vector<float> af;
1427             float v = vstr_to_float(vstr);
1428             af.push_back(v);
1429 
1430             nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
1431             while (nscan == 1)
1432             {
1433                 pch += nconsumed;
1434 
1435                 float v = vstr_to_float(vstr);
1436                 af.push_back(v);
1437 
1438                 nscan = sscanf(pch, ",%19[^,]%n", vstr, &nconsumed);
1439             }
1440 
1441             // array end
1442             aaf.push_back(af);
1443         }
1444 
1445         pch = strtok(NULL, "[]");
1446     }
1447 
1448     return aaf;
1449 }
1450 
// Parse a list of bracketed int arrays such as "[224,224,3],[0,0]".
//
// s is split in place on '[' and ']' with strtok. Inside each piece the
// comma separated integers are read with sscanf until one fails to parse.
// Pieces that do not start with an integer, such as the "," between two
// arrays, are skipped.
static std::vector<std::vector<int> > parse_comma_int_array_list(char* s)
{
    std::vector<std::vector<int> > arrays;

    for (char* tok = strtok(s, "[]"); tok != NULL; tok = strtok(NULL, "[]"))
    {
        int value;
        int used = 0;

        // the first number carries no leading comma
        if (sscanf(tok, "%d%n", &value, &used) != 1)
            continue;

        std::vector<int> values;
        do
        {
            tok += used;
            values.push_back(value);
        } while (sscanf(tok, ",%d%n", &value, &used) == 1);

        // array end
        arrays.push_back(values);
    }

    return arrays;
}
1489 
parse_comma_pixel_type_list(char * s)1490 static std::vector<int> parse_comma_pixel_type_list(char* s)
1491 {
1492     std::vector<int> aps;
1493 
1494     char* pch = strtok(s, ",");
1495     while (pch != NULL)
1496     {
1497         // RAW/RGB/BGR/GRAY/RGBA/BGRA
1498         if (strcmp(pch, "RAW") == 0)
1499             aps.push_back(-233);
1500         if (strcmp(pch, "RGB") == 0)
1501             aps.push_back(ncnn::Mat::PIXEL_RGB);
1502         if (strcmp(pch, "BGR") == 0)
1503             aps.push_back(ncnn::Mat::PIXEL_BGR);
1504         if (strcmp(pch, "GRAY") == 0)
1505             aps.push_back(ncnn::Mat::PIXEL_GRAY);
1506         if (strcmp(pch, "RGBA") == 0)
1507             aps.push_back(ncnn::Mat::PIXEL_RGBA);
1508         if (strcmp(pch, "BGRA") == 0)
1509             aps.push_back(ncnn::Mat::PIXEL_BGRA);
1510 
1511         pch = strtok(NULL, ",");
1512     }
1513 
1514     return aps;
1515 }
1516 
// Write a list of float arrays to stderr as "[a,b,c],[d,e]" using "%f" for
// every element, with no trailing newline.
static void print_float_array_list(const std::vector<std::vector<float> >& list)
{
    const size_t array_count = list.size();
    for (size_t a = 0; a < array_count; a++)
    {
        // separators go in front of every array but the first
        if (a > 0)
            fprintf(stderr, ",");

        const std::vector<float>& values = list[a];
        const size_t value_count = values.size();

        fprintf(stderr, "[");
        for (size_t e = 0; e < value_count; e++)
        {
            if (e > 0)
                fprintf(stderr, ",");
            fprintf(stderr, "%f", values[e]);
        }
        fprintf(stderr, "]");
    }
}
1534 
// Write a list of int arrays to stderr as "[a,b,c],[d,e]", with no trailing
// newline.
static void print_int_array_list(const std::vector<std::vector<int> >& list)
{
    const size_t array_count = list.size();
    for (size_t a = 0; a < array_count; a++)
    {
        // separators go in front of every array but the first
        if (a > 0)
            fprintf(stderr, ",");

        const std::vector<int>& values = list[a];
        const size_t value_count = values.size();

        fprintf(stderr, "[");
        for (size_t e = 0; e < value_count; e++)
        {
            if (e > 0)
                fprintf(stderr, ",");
            fprintf(stderr, "%d", values[e]);
        }
        fprintf(stderr, "]");
    }
}
1552 
print_pixel_type_list(const std::vector<int> & list)1553 static void print_pixel_type_list(const std::vector<int>& list)
1554 {
1555     for (size_t i = 0; i < list.size(); i++)
1556     {
1557         const int type = list[i];
1558         if (type == -233)
1559             fprintf(stderr, "RAW");
1560         if (type == ncnn::Mat::PIXEL_RGB)
1561             fprintf(stderr, "RGB");
1562         if (type == ncnn::Mat::PIXEL_BGR)
1563             fprintf(stderr, "BGR");
1564         if (type == ncnn::Mat::PIXEL_GRAY)
1565             fprintf(stderr, "GRAY");
1566         if (type == ncnn::Mat::PIXEL_RGBA)
1567             fprintf(stderr, "RGBA");
1568         if (type == ncnn::Mat::PIXEL_BGRA)
1569             fprintf(stderr, "BGRA");
1570         if (i != list.size() - 1)
1571             fprintf(stderr, ",");
1572     }
1573 }
1574 
// Print the command line synopsis, the supported key=value options and a
// sample invocation to stderr.
static void show_usage()
{
    // a single write; none of the lines contains a format specifier
    fputs("Usage: ncnn2table [ncnnparam] [ncnnbin] [list,...] [ncnntable] [(key=value)...]\n"
          "  mean=[104.0,117.0,123.0],...\n"
          "  norm=[1.0,1.0,1.0],...\n"
          "  shape=[224,224,3],...[w,h,c] or [w,h] **[0,0] will not resize\n"
          "  pixel=RAW/RGB/BGR/GRAY/RGBA/BGRA,...\n"
          "  thread=8\n"
          "  method=kl/aciq/eq\n"
          "Sample usage: ncnn2table squeezenet.param squeezenet.bin imagelist.txt squeezenet.table mean=[104.0,117.0,123.0] norm=[1.0,1.0,1.0] shape=[227,227,3] pixel=BGR method=kl\n",
          stderr);
}
1586 
main(int argc,char ** argv)1587 int main(int argc, char** argv)
1588 {
1589     if (argc < 5)
1590     {
1591         show_usage();
1592         return -1;
1593     }
1594 
1595     for (int i = 1; i < argc; i++)
1596     {
1597         if (argv[i][0] == '-')
1598         {
1599             show_usage();
1600             return -1;
1601         }
1602     }
1603 
1604     const char* inparam = argv[1];
1605     const char* inbin = argv[2];
1606     char* lists = argv[3];
1607     const char* outtable = argv[4];
1608 
1609     ncnn::Option opt;
1610     opt.num_threads = 1;
1611     opt.use_fp16_packed = false;
1612     opt.use_fp16_storage = false;
1613     opt.use_fp16_arithmetic = false;
1614 
1615     QuantNet net;
1616     net.opt = opt;
1617     net.load_param(inparam);
1618     net.load_model(inbin);
1619 
1620     net.init();
1621 
1622     // load lists
1623     net.listspaths = parse_comma_path_list(lists);
1624 
1625     std::string method = "kl";
1626 
1627     for (int i = 5; i < argc; i++)
1628     {
1629         // key=value
1630         char* kv = argv[i];
1631 
1632         char* eqs = strchr(kv, '=');
1633         if (eqs == NULL)
1634         {
1635             fprintf(stderr, "unrecognized arg %s\n", kv);
1636             continue;
1637         }
1638 
1639         // split k v
1640         eqs[0] = '\0';
1641         const char* key = kv;
1642         char* value = eqs + 1;
1643 
1644         // load mean norm shape
1645         if (memcmp(key, "mean", 4) == 0)
1646             net.means = parse_comma_float_array_list(value);
1647         if (memcmp(key, "norm", 4) == 0)
1648             net.norms = parse_comma_float_array_list(value);
1649         if (memcmp(key, "shape", 5) == 0)
1650             net.shapes = parse_comma_int_array_list(value);
1651         if (memcmp(key, "pixel", 5) == 0)
1652             net.type_to_pixels = parse_comma_pixel_type_list(value);
1653         if (memcmp(key, "thread", 6) == 0)
1654             net.quantize_num_threads = atoi(value);
1655         if (memcmp(key, "method", 6) == 0)
1656             method = std::string(value);
1657     }
1658 
1659     // sanity check
1660     const size_t input_blob_count = net.input_blobs.size();
1661     if (net.listspaths.size() != input_blob_count)
1662     {
1663         fprintf(stderr, "expect %d lists, but got %d\n", (int)input_blob_count, (int)net.listspaths.size());
1664         return -1;
1665     }
1666     if (net.means.size() != input_blob_count)
1667     {
1668         fprintf(stderr, "expect %d means, but got %d\n", (int)input_blob_count, (int)net.means.size());
1669         return -1;
1670     }
1671     if (net.norms.size() != input_blob_count)
1672     {
1673         fprintf(stderr, "expect %d norms, but got %d\n", (int)input_blob_count, (int)net.norms.size());
1674         return -1;
1675     }
1676     if (net.shapes.size() != input_blob_count)
1677     {
1678         fprintf(stderr, "expect %d shapes, but got %d\n", (int)input_blob_count, (int)net.shapes.size());
1679         return -1;
1680     }
1681     if (net.type_to_pixels.size() != input_blob_count)
1682     {
1683         fprintf(stderr, "expect %d pixels, but got %d\n", (int)input_blob_count, (int)net.type_to_pixels.size());
1684         return -1;
1685     }
1686     if (net.quantize_num_threads < 0)
1687     {
1688         fprintf(stderr, "malformed thread %d\n", net.quantize_num_threads);
1689         return -1;
1690     }
1691 
1692     // print quantnet config
1693     {
1694         fprintf(stderr, "mean = ");
1695         print_float_array_list(net.means);
1696         fprintf(stderr, "\n");
1697         fprintf(stderr, "norm = ");
1698         print_float_array_list(net.norms);
1699         fprintf(stderr, "\n");
1700         fprintf(stderr, "shape = ");
1701         print_int_array_list(net.shapes);
1702         fprintf(stderr, "\n");
1703         fprintf(stderr, "pixel = ");
1704         print_pixel_type_list(net.type_to_pixels);
1705         fprintf(stderr, "\n");
1706         fprintf(stderr, "thread = %d\n", net.quantize_num_threads);
1707         fprintf(stderr, "method = %s\n", method.c_str());
1708         fprintf(stderr, "---------------------------------------\n");
1709     }
1710 
1711     if (method == "kl")
1712     {
1713         net.quantize_KL();
1714     }
1715     else if (method == "aciq")
1716     {
1717         net.quantize_ACIQ();
1718     }
1719     else if (method == "eq")
1720     {
1721         net.quantize_EQ();
1722     }
1723     else
1724     {
1725         fprintf(stderr, "not implemented yet !\n");
1726         fprintf(stderr, "unknown method %s, expect kl / aciq / eq\n", method.c_str());
1727         return -1;
1728     }
1729 
1730     net.print_quant_info();
1731 
1732     net.save_table(outtable);
1733 
1734     return 0;
1735 }
1736