1 // File: altorenderer.cpp
2 // Description: ALTO rendering interface
3 // Author: Jake Sebright
4
5 // (C) Copyright 2018
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #ifdef _WIN32
17 # include "host.h" // windows.h for MultiByteToWideChar, ...
18 #endif
19
20 #include <tesseract/baseapi.h>
21 #include <tesseract/renderer.h>
22
23 #include <memory>
24 #include <sstream> // for std::stringstream
25
26 namespace tesseract {
27
28 /// Add coordinates to specified TextBlock, TextLine or String bounding box.
29 /// Add word confidence if adding to a String bounding box.
30 ///
AddBoxToAlto(const ResultIterator * it,PageIteratorLevel level,std::stringstream & alto_str)31 static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
32 std::stringstream &alto_str) {
33 int left, top, right, bottom;
34 it->BoundingBox(level, &left, &top, &right, &bottom);
35
36 int hpos = left;
37 int vpos = top;
38 int height = bottom - top;
39 int width = right - left;
40
41 alto_str << " HPOS=\"" << hpos << "\"";
42 alto_str << " VPOS=\"" << vpos << "\"";
43 alto_str << " WIDTH=\"" << width << "\"";
44 alto_str << " HEIGHT=\"" << height << "\"";
45
46 if (level == RIL_WORD) {
47 int wc = it->Confidence(RIL_WORD);
48 alto_str << " WC=\"0." << wc << "\"";
49 } else {
50 alto_str << ">";
51 }
52 }
53
54 ///
55 /// Append the ALTO XML for the beginning of the document
56 ///
BeginDocumentHandler()57 bool TessAltoRenderer::BeginDocumentHandler() {
58 // Delay the XML output because we need the name of the image file.
59 begin_document = true;
60 return true;
61 }
62
63 ///
64 /// Append the ALTO XML for the layout of the image
65 ///
AddImageHandler(TessBaseAPI * api)66 bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
67 if (begin_document) {
68 AppendString(
69 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
70 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
71 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
72 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
73 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
74 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
75 "\t<Description>\n"
76 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
77 "\t\t<sourceImageInformation>\n"
78 "\t\t\t<fileName>");
79
80 AppendString(api->GetInputName());
81
82 AppendString(
83 "</fileName>\n"
84 "\t\t</sourceImageInformation>\n"
85 "\t\t<OCRProcessing ID=\"OCR_0\">\n"
86 "\t\t\t<ocrProcessingStep>\n"
87 "\t\t\t\t<processingSoftware>\n"
88 "\t\t\t\t\t<softwareName>tesseract ");
89 AppendString(TessBaseAPI::Version());
90 AppendString(
91 "</softwareName>\n"
92 "\t\t\t\t</processingSoftware>\n"
93 "\t\t\t</ocrProcessingStep>\n"
94 "\t\t</OCRProcessing>\n"
95 "\t</Description>\n"
96 "\t<Layout>\n");
97 begin_document = false;
98 }
99
100 const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
101 if (text == nullptr) {
102 return false;
103 }
104
105 AppendString(text.get());
106
107 return true;
108 }
109
110 ///
111 /// Append the ALTO XML for the end of the document
112 ///
EndDocumentHandler()113 bool TessAltoRenderer::EndDocumentHandler() {
114 AppendString("\t</Layout>\n</alto>\n");
115
116 return true;
117 }
118
TessAltoRenderer(const char * outputbase)119 TessAltoRenderer::TessAltoRenderer(const char *outputbase)
120 : TessResultRenderer(outputbase, "xml"),
121 begin_document(false) {}
122
123 ///
124 /// Make an XML-formatted string with ALTO markup from the internal
125 /// data structures.
126 ///
GetAltoText(int page_number)127 char *TessBaseAPI::GetAltoText(int page_number) {
128 return GetAltoText(nullptr, page_number);
129 }
130
131 ///
132 /// Make an XML-formatted string with ALTO markup from the internal
133 /// data structures.
134 ///
GetAltoText(ETEXT_DESC * monitor,int page_number)135 char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
136 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
137 return nullptr;
138 }
139
140 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
141
142 if (input_file_.empty()) {
143 SetInputName(nullptr);
144 }
145
146 #ifdef _WIN32
147 // convert input name from ANSI encoding to utf-8
148 int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
149 wchar_t *uni16_str = new WCHAR[str16_len];
150 str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
151 int utf8_len =
152 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
153 char *utf8_str = new char[utf8_len];
154 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
155 input_file_ = utf8_str;
156 delete[] uni16_str;
157 delete[] utf8_str;
158 #endif
159
160 std::stringstream alto_str;
161 // Use "C" locale (needed for int values larger than 999).
162 alto_str.imbue(std::locale::classic());
163 alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
164 << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
165 << " ID=\"page_" << page_number << "\">\n"
166 << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
167 << " WIDTH=\"" << rect_width_ << "\""
168 << " HEIGHT=\"" << rect_height_ << "\">\n";
169
170 ResultIterator *res_it = GetIterator();
171 while (!res_it->Empty(RIL_BLOCK)) {
172 if (res_it->Empty(RIL_WORD)) {
173 res_it->Next(RIL_WORD);
174 continue;
175 }
176
177 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
178 alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
179 AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
180 alto_str << "\n";
181 }
182
183 if (res_it->IsAtBeginningOf(RIL_PARA)) {
184 alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
185 AddBoxToAlto(res_it, RIL_PARA, alto_str);
186 alto_str << "\n";
187 }
188
189 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
190 alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
191 AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
192 alto_str << "\n";
193 }
194
195 alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
196 AddBoxToAlto(res_it, RIL_WORD, alto_str);
197 alto_str << " CONTENT=\"";
198
199 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
200 bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
201 bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
202
203 int left, top, right, bottom;
204 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
205
206 do {
207 const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
208 if (grapheme && grapheme[0] != 0) {
209 alto_str << HOcrEscape(grapheme.get()).c_str();
210 }
211 res_it->Next(RIL_SYMBOL);
212 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
213
214 alto_str << "\"/>";
215
216 wcnt++;
217
218 if (last_word_in_line) {
219 alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
220 lcnt++;
221 } else {
222 int hpos = right;
223 int vpos = top;
224 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
225 int width = left - hpos;
226 alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
227 << "\"/>\n";
228 }
229
230 if (last_word_in_tblock) {
231 alto_str << "\t\t\t\t\t</TextBlock>\n";
232 tcnt++;
233 }
234
235 if (last_word_in_cblock) {
236 alto_str << "\t\t\t\t</ComposedBlock>\n";
237 bcnt++;
238 }
239 }
240
241 alto_str << "\t\t\t</PrintSpace>\n"
242 << "\t\t</Page>\n";
243 const std::string &text = alto_str.str();
244
245 char *result = new char[text.length() + 1];
246 strcpy(result, text.c_str());
247 delete res_it;
248 return result;
249 }
250
251 } // namespace tesseract
252