1 /*
2     Text detection model: https://github.com/argman/EAST
3     Download link: https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
4 
5     Text recognition models can be downloaded directly here:
6     Download link: https://drive.google.com/drive/folders/1cTbQ3nuZG-EKWak6emD_s8_hHXWz7lAr?usp=sharing
7     and doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown
8 
    How to convert the CRNN recognition model from a PyTorch checkpoint (.pth) to onnx:
10     Using classes from here: https://github.com/meijieru/crnn.pytorch/blob/master/models/crnn.py
11     import torch
12     from models.crnn import CRNN
13     model = CRNN(32, 1, 37, 256)
14     model.load_state_dict(torch.load('crnn.pth'))
15     dummy_input = torch.randn(1, 1, 32, 100)
16     torch.onnx.export(model, dummy_input, "crnn.onnx", verbose=True)
17 
18     For more information, please refer to doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown and doc/tutorials/dnn/dnn_OCR/dnn_OCR.markdown
19 */
20 #include <iostream>
21 #include <fstream>
22 
23 #include <opencv2/imgproc.hpp>
24 #include <opencv2/highgui.hpp>
25 #include <opencv2/dnn.hpp>
26 
27 using namespace cv;
28 using namespace cv::dnn;
29 
// Command-line options, in cv::CommandLineParser syntax:
//   "{ name alias | default value | help text }"
// Option names, aliases and defaults are read by main(); only the help texts are descriptive.
const char* keys =
    "{ help  h              | | Print help message. }"
    "{ input i              | | Path to input image or video file. Skip this argument to capture frames from a camera.}"
    "{ detModel dmp         | | Path to a binary .pb file that contains the trained detector network.}"
    "{ width                | 320 | Preprocess input image by resizing to a specific width. It should be a multiple of 32. }"
    "{ height               | 320 | Preprocess input image by resizing to a specific height. It should be a multiple of 32. }"
    "{ thr                  | 0.5 | Confidence threshold. }"
    "{ nms                  | 0.4 | Non-maximum suppression threshold. }"
    "{ recModel rmp         | | Path to a binary .onnx file that contains the trained CRNN text recognition model. "
        "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}"
    "{ RGBInput rgb         |0| 0: run recognition on a grayscale copy of each frame; 1: run recognition on the color frame as captured. }"
    "{ vocabularyPath vp    | alphabet_36.txt | Path to the recognition vocabulary file (one entry per line). "
        "Download links are provided in doc/tutorials/dnn/dnn_text_spotting/dnn_text_spotting.markdown}";
43 
// Warps the quadrangle given by four `vertices` of `frame` into an upright patch stored in
// `result`. Declared here because main() calls it before its definition further down the file.
void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result);
45 
main(int argc,char ** argv)46 int main(int argc, char** argv)
47 {
48     // Parse command line arguments.
49     CommandLineParser parser(argc, argv, keys);
50     parser.about("Use this script to run TensorFlow implementation (https://github.com/argman/EAST) of "
51                  "EAST: An Efficient and Accurate Scene Text Detector (https://arxiv.org/abs/1704.03155v2)");
52     if (argc == 1 || parser.has("help"))
53     {
54         parser.printMessage();
55         return 0;
56     }
57 
58     float confThreshold = parser.get<float>("thr");
59     float nmsThreshold = parser.get<float>("nms");
60     int width = parser.get<int>("width");
61     int height = parser.get<int>("height");
62     int imreadRGB = parser.get<int>("RGBInput");
63     String detModelPath = parser.get<String>("detModel");
64     String recModelPath = parser.get<String>("recModel");
65     String vocPath = parser.get<String>("vocabularyPath");
66 
67     if (!parser.check())
68     {
69         parser.printErrors();
70         return 1;
71     }
72 
73     // Load networks.
74     CV_Assert(!detModelPath.empty() && !recModelPath.empty());
75     TextDetectionModel_EAST detector(detModelPath);
76     detector.setConfidenceThreshold(confThreshold)
77             .setNMSThreshold(nmsThreshold);
78 
79     TextRecognitionModel recognizer(recModelPath);
80 
81     // Load vocabulary
82     CV_Assert(!vocPath.empty());
83     std::ifstream vocFile;
84     vocFile.open(samples::findFile(vocPath));
85     CV_Assert(vocFile.is_open());
86     String vocLine;
87     std::vector<String> vocabulary;
88     while (std::getline(vocFile, vocLine)) {
89         vocabulary.push_back(vocLine);
90     }
91     recognizer.setVocabulary(vocabulary);
92     recognizer.setDecodeType("CTC-greedy");
93 
94     // Parameters for Recognition
95     double recScale = 1.0 / 127.5;
96     Scalar recMean = Scalar(127.5, 127.5, 127.5);
97     Size recInputSize = Size(100, 32);
98     recognizer.setInputParams(recScale, recInputSize, recMean);
99 
100     // Parameters for Detection
101     double detScale = 1.0;
102     Size detInputSize = Size(width, height);
103     Scalar detMean = Scalar(123.68, 116.78, 103.94);
104     bool swapRB = true;
105     detector.setInputParams(detScale, detInputSize, detMean, swapRB);
106 
107     // Open a video file or an image file or a camera stream.
108     VideoCapture cap;
109     bool openSuccess = parser.has("input") ? cap.open(parser.get<String>("input")) : cap.open(0);
110     CV_Assert(openSuccess);
111 
112     static const std::string kWinName = "EAST: An Efficient and Accurate Scene Text Detector";
113 
114     Mat frame;
115     while (waitKey(1) < 0)
116     {
117         cap >> frame;
118         if (frame.empty())
119         {
120             waitKey();
121             break;
122         }
123 
124         std::cout << frame.size << std::endl;
125 
126         // Detection
127         std::vector< std::vector<Point> > detResults;
128         detector.detect(frame, detResults);
129 
130         if (detResults.size() > 0) {
131             // Text Recognition
132             Mat recInput;
133             if (!imreadRGB) {
134                 cvtColor(frame, recInput, cv::COLOR_BGR2GRAY);
135             } else {
136                 recInput = frame;
137             }
138             std::vector< std::vector<Point> > contours;
139             for (uint i = 0; i < detResults.size(); i++)
140             {
141                 const auto& quadrangle = detResults[i];
142                 CV_CheckEQ(quadrangle.size(), (size_t)4, "");
143 
144                 contours.emplace_back(quadrangle);
145 
146                 std::vector<Point2f> quadrangle_2f;
147                 for (int j = 0; j < 4; j++)
148                     quadrangle_2f.emplace_back(quadrangle[j]);
149 
150                 Mat cropped;
151                 fourPointsTransform(recInput, &quadrangle_2f[0], cropped);
152 
153                 std::string recognitionResult = recognizer.recognize(cropped);
154                 std::cout << i << ": '" << recognitionResult << "'" << std::endl;
155 
156                 putText(frame, recognitionResult, quadrangle[3], FONT_HERSHEY_SIMPLEX, 1.5, Scalar(0, 0, 255), 2);
157             }
158             polylines(frame, contours, true, Scalar(0, 255, 0), 2);
159         }
160         imshow(kWinName, frame);
161     }
162     return 0;
163 }
164 
fourPointsTransform(const Mat & frame,const Point2f vertices[],Mat & result)165 void fourPointsTransform(const Mat& frame, const Point2f vertices[], Mat& result)
166 {
167     const Size outputSize = Size(100, 32);
168 
169     Point2f targetVertices[4] = {
170         Point(0, outputSize.height - 1),
171         Point(0, 0), Point(outputSize.width - 1, 0),
172         Point(outputSize.width - 1, outputSize.height - 1)
173     };
174     Mat rotationMatrix = getPerspectiveTransform(vertices, targetVertices);
175 
176     warpPerspective(frame, result, rotationMatrix, outputSize);
177 }
178