1 /********************************************************************** 2 * File: boxread.h 3 * Description: Read data from a box file. 4 * Author: Ray Smith 5 * 6 * (C) Copyright 2007, Google Inc. 7 ** Licensed under the Apache License, Version 2.0 (the "License"); 8 ** you may not use this file except in compliance with the License. 9 ** You may obtain a copy of the License at 10 ** http://www.apache.org/licenses/LICENSE-2.0 11 ** Unless required by applicable law or agreed to in writing, software 12 ** distributed under the License is distributed on an "AS IS" BASIS, 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 ** See the License for the specific language governing permissions and 15 ** limitations under the License. 16 * 17 **********************************************************************/ 18 19 #ifndef TESSERACT_CCUTIL_BOXREAD_H_ 20 #define TESSERACT_CCUTIL_BOXREAD_H_ 21 22 #include <cstdio> // for FILE 23 #include <string> // for std::string 24 #include <vector> // for std::vector 25 26 #include <tesseract/export.h> // for TESS_API 27 28 namespace tesseract { 29 30 class TBOX; 31 32 // Size of buffer used to read a line from a box file. 33 const int kBoxReadBufSize = 1024; 34 35 // Open the boxfile based on the given image filename. 36 // Returns nullptr if the box file cannot be opened. 37 TESS_API 38 FILE *OpenBoxFile(const char *filename); 39 40 // Reads all boxes from the given filename. 41 // Reads a specific target_page number if >= 0, or all pages otherwise. 42 // Skips blanks if skip_blanks is true. 43 // The UTF-8 label of the box is put in texts, and the full box definition as 44 // a string is put in box_texts, with the corresponding page number in pages. 45 // Each of the output vectors is optional (may be nullptr). 46 // Returns false if no boxes are found. 47 bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes, 48 std::vector<std::string> *texts, std::vector<std::string> *box_texts, 49 std::vector<int> *pages); 50 51 // Reads all boxes from the string. Otherwise, as ReadAllBoxes. 52 // continue_on_failure allows reading to continue even if an invalid box is 53 // encountered and will return true if it succeeds in reading some boxes. 54 // It otherwise gives up and returns false on encountering an invalid box. 55 TESS_API 56 bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, 57 std::vector<TBOX> *boxes, std::vector<std::string> *texts, 58 std::vector<std::string> *box_texts, std::vector<int> *pages); 59 60 // ReadNextBox factors out the code to interpret a line of a box 61 // file so that applybox and unicharset_extractor interpret the same way. 62 // This function returns the next valid box file utf8 string and coords 63 // and returns true, or false on eof (and closes the file). 64 // It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks 65 // for valid utf-8 and allows space or tab between fields. 66 // utf8_str is set with the unichar string, and bounding box with the box. 67 // If there are page numbers in the file, it reads them all. 68 TESS_API 69 bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box); 70 // As ReadNextBox above, but get a specific page number. (0-based) 71 // Use -1 to read any page number. Files without page number all 72 // read as if they are page 0. 73 TESS_API 74 bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str, 75 TBOX *bounding_box); 76 77 // Parses the given box file string into a page_number, utf8_str, and 78 // bounding_box. Returns true on a successful parse. 79 TESS_API 80 bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, 81 TBOX *bounding_box); 82 83 // Creates a box file string from a unichar string, TBOX and page number. 84 TESS_API 85 void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str); 86 87 } // namespace tesseract 88 89 #endif // TESSERACT_CCUTIL_BOXREAD_H_ 90