1 /////////////////////////////////////////////////////////////////////// 2 // File: imagedata.h 3 // Description: Class to hold information about a single image and its 4 // corresponding boxes or text file. 5 // Author: Ray Smith 6 // 7 // (C) Copyright 2013, Google Inc. 8 // Licensed under the Apache License, Version 2.0 (the "License"); 9 // you may not use this file except in compliance with the License. 10 // You may obtain a copy of the License at 11 // http://www.apache.org/licenses/LICENSE-2.0 12 // Unless required by applicable law or agreed to in writing, software 13 // distributed under the License is distributed on an "AS IS" BASIS, 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 // See the License for the specific language governing permissions and 16 // limitations under the License. 17 /////////////////////////////////////////////////////////////////////// 18 19 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_ 20 #define TESSERACT_IMAGE_IMAGEDATA_H_ 21 22 #include "image.h" 23 #include "points.h" // for FCOORD 24 25 #include <mutex> // for std::mutex 26 #include <thread> // for std::thread 27 28 struct Pix; 29 30 namespace tesseract { 31 32 class TFile; 33 class ScrollView; 34 class TBOX; 35 36 // Amount of padding to apply in output pixels in feature mode. 37 const int kFeaturePadding = 2; 38 // Number of pixels to pad around text boxes. 39 const int kImagePadding = 4; 40 41 // Enum to determine the caching and data sequencing strategy. 42 enum CachingStrategy { 43 // Reads all of one file before moving on to the next. Requires samples to be 44 // shuffled across files. Uses the count of samples in the first file as 45 // the count in all the files to achieve high-speed random access. As a 46 // consequence, if subsequent files are smaller, they get entries used more 47 // than once, and if subsequent files are larger, some entries are not used. 48 // Best for larger data sets that don't fit in memory. 49 CS_SEQUENTIAL, 50 // Reads one sample from each file in rotation. Does not require shuffled 51 // samples, but is extremely disk-intensive. Samples in smaller files also 52 // get used more often than samples in larger files. 53 // Best for smaller data sets that mostly fit in memory. 54 CS_ROUND_ROBIN, 55 }; 56 57 // Class to hold information on a single image: 58 // Filename, cached image as a Pix*, character boxes, text transcription. 59 // The text transcription is the ground truth UTF-8 text for the image. 60 // Character boxes are optional and indicate the desired segmentation of 61 // the text into recognition units. 62 class TESS_API ImageData { 63 public: 64 ImageData(); 65 // Takes ownership of the pix. 66 ImageData(bool vertical, Image pix); 67 ~ImageData(); 68 69 // Builds and returns an ImageData from the basic data. Note that imagedata, 70 // truth_text, and box_text are all the actual file data, NOT filenames. 71 static ImageData *Build(const char *name, int page_number, const char *lang, 72 const char *imagedata, int imagedatasize, const char *truth_text, 73 const char *box_text); 74 75 // Writes to the given file. Returns false in case of error. 76 bool Serialize(TFile *fp) const; 77 // Reads from the given file. Returns false in case of error. 78 bool DeSerialize(TFile *fp); 79 // As DeSerialize, but only seeks past the data - hence a static method. 80 static bool SkipDeSerialize(TFile *fp); 81 82 // Other accessors. imagefilename()83 const std::string &imagefilename() const { 84 return imagefilename_; 85 } set_imagefilename(const std::string & name)86 void set_imagefilename(const std::string &name) { 87 imagefilename_ = name; 88 } page_number()89 int page_number() const { 90 return page_number_; 91 } set_page_number(int num)92 void set_page_number(int num) { 93 page_number_ = num; 94 } image_data()95 const std::vector<char> &image_data() const { 96 return image_data_; 97 } language()98 const std::string &language() const { 99 return language_; 100 } set_language(const std::string & lang)101 void set_language(const std::string &lang) { 102 language_ = lang; 103 } transcription()104 const std::string &transcription() const { 105 return transcription_; 106 } boxes()107 const std::vector<TBOX> &boxes() const { 108 return boxes_; 109 } box_texts()110 const std::vector<std::string> &box_texts() const { 111 return box_texts_; 112 } box_text(int index)113 const std::string &box_text(int index) const { 114 return box_texts_[index]; 115 } 116 // Saves the given Pix as a PNG-encoded string and destroys it. 117 // In case of missing PNG support in Leptonica use PNM format, 118 // which requires more memory. 119 void SetPix(Image pix); 120 // Returns the Pix image for *this. Must be pixDestroyed after use. 121 Image GetPix() const; 122 // Gets anything and everything with a non-nullptr pointer, prescaled to a 123 // given target_height (if 0, then the original image height), and aligned. 124 // Also returns (if not nullptr) the width and height of the scaled image. 125 // The return value is the scaled Pix, which must be pixDestroyed after use, 126 // and scale_factor (if not nullptr) is set to the scale factor that was 127 // applied to the image to achieve the target_height. 128 Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, 129 int *scaled_height, std::vector<TBOX> *boxes) const; 130 131 int MemoryUsed() const; 132 133 // Draws the data in a new window. 134 void Display() const; 135 136 // Adds the supplied boxes and transcriptions that correspond to the correct 137 // page number. 138 void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts, 139 const std::vector<int> &box_pages); 140 141 private: 142 // Saves the given Pix as a PNG-encoded string and destroys it. 143 // In case of missing PNG support in Leptonica use PNM format, 144 // which requires more memory. 145 static void SetPixInternal(Image pix, std::vector<char> *image_data); 146 // Returns the Pix image for the image_data. Must be pixDestroyed after use. 147 static Image GetPixInternal(const std::vector<char> &image_data); 148 // Parses the text string as a box file and adds any discovered boxes that 149 // match the page number. Returns false on error. 150 bool AddBoxes(const char *box_text); 151 152 private: 153 std::string imagefilename_; // File to read image from. 154 int32_t page_number_; // Page number if multi-page tif or -1. 155 // see https://github.com/tesseract-ocr/tesseract/pull/2965 156 // EP: reconsider for tess6.0/opencv 157 #ifdef TESSERACT_IMAGEDATA_AS_PIX 158 Image internal_pix_; 159 #endif 160 std::vector<char> image_data_; // PNG/PNM file data. 161 std::string language_; // Language code for image. 162 std::string transcription_; // UTF-8 ground truth of image. 163 std::vector<TBOX> boxes_; // If non-empty boxes of the image. 164 std::vector<std::string> box_texts_; // String for text in each box. 165 bool vertical_text_; // Image has been rotated from vertical. 166 }; 167 168 // A collection of ImageData that knows roughly how much memory it is using. 169 class DocumentData { 170 public: 171 TESS_API 172 explicit DocumentData(const std::string &name); 173 TESS_API 174 ~DocumentData(); 175 176 // Reads all the pages in the given lstmf filename to the cache. The reader 177 // is used to read the file. 178 TESS_API 179 bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader); 180 // Sets up the document, without actually loading it. 181 void SetDocument(const char *filename, int64_t max_memory, FileReader reader); 182 // Writes all the pages to the given filename. Returns false on error. 183 TESS_API 184 bool SaveDocument(const char *filename, FileWriter writer); 185 186 // Adds the given page data to this document, counting up memory. 187 TESS_API 188 void AddPageToDocument(ImageData *page); 189 document_name()190 const std::string &document_name() const { 191 std::lock_guard<std::mutex> lock(general_mutex_); 192 return document_name_; 193 } NumPages()194 int NumPages() const { 195 std::lock_guard<std::mutex> lock(general_mutex_); 196 return total_pages_; 197 } PagesSize()198 size_t PagesSize() const { 199 return pages_.size(); 200 } memory_used()201 int64_t memory_used() const { 202 std::lock_guard<std::mutex> lock(general_mutex_); 203 return memory_used_; 204 } 205 // If the given index is not currently loaded, loads it using a separate 206 // thread. Note: there are 4 cases: 207 // Document uncached: IsCached() returns false, total_pages_ < 0. 208 // Required page is available: IsPageAvailable returns true. In this case, 209 // total_pages_ > 0 and 210 // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size() 211 // Pages are loaded, but the required one is not. 212 // The requested page is being loaded by LoadPageInBackground. In this case, 213 // index == pages_offset_. Once the loading starts, the pages lock is held 214 // until it completes, at which point IsPageAvailable will unblock and return 215 // true. 216 void LoadPageInBackground(int index); 217 // Returns a pointer to the page with the given index, modulo the total 218 // number of pages. Blocks until the background load is completed. 219 TESS_API 220 const ImageData *GetPage(int index); 221 // Returns true if the requested page is available, and provides a pointer, 222 // which may be nullptr if the document is empty. May block, even though it 223 // doesn't guarantee to return true. 224 bool IsPageAvailable(int index, ImageData **page); 225 // Takes ownership of the given page index. The page is made nullptr in *this. TakePage(int index)226 ImageData *TakePage(int index) { 227 std::lock_guard<std::mutex> lock(pages_mutex_); 228 ImageData *page = pages_[index]; 229 pages_[index] = nullptr; 230 return page; 231 } 232 // Returns true if the document is currently loaded or in the process of 233 // loading. IsCached()234 bool IsCached() const { 235 return NumPages() >= 0; 236 } 237 // Removes all pages from memory and frees the memory, but does not forget 238 // the document metadata. Returns the memory saved. 239 int64_t UnCache(); 240 // Shuffles all the pages in the document. 241 void Shuffle(); 242 243 private: 244 // Sets the value of total_pages_ behind a mutex. set_total_pages(int total)245 void set_total_pages(int total) { 246 std::lock_guard<std::mutex> lock(general_mutex_); 247 total_pages_ = total; 248 } set_memory_used(int64_t memory_used)249 void set_memory_used(int64_t memory_used) { 250 std::lock_guard<std::mutex> lock(general_mutex_); 251 memory_used_ = memory_used; 252 } 253 // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_ 254 // starting at index pages_offset_. 255 bool ReCachePages(); 256 257 private: 258 // A name for this document. 259 std::string document_name_; 260 // A group of pages that corresponds in some loose way to a document. 261 std::vector<ImageData *> pages_; 262 // Page number of the first index in pages_. 263 int pages_offset_; 264 // Total number of pages in document (may exceed size of pages_.) 265 int total_pages_; 266 // Total of all pix sizes in the document. 267 int64_t memory_used_; 268 // Max memory to use at any time. 269 int64_t max_memory_; 270 // Saved reader from LoadDocument to allow re-caching. 271 FileReader reader_; 272 // Mutex that protects pages_ and pages_offset_ against multiple parallel 273 // loads, and provides a wait for page. 274 std::mutex pages_mutex_; 275 // Mutex that protects other data members that callers want to access without 276 // waiting for a load operation. 277 mutable std::mutex general_mutex_; 278 279 // Thread which loads document. 280 std::thread thread; 281 }; 282 283 // A collection of DocumentData that knows roughly how much memory it is using. 284 // Note that while it supports background read-ahead, it assumes that a single 285 // thread is accessing documents, ie it is not safe for multiple threads to 286 // access different documents in parallel, as one may de-cache the other's 287 // content. 288 class DocumentCache { 289 public: 290 TESS_API 291 explicit DocumentCache(int64_t max_memory); 292 TESS_API 293 ~DocumentCache(); 294 295 // Deletes all existing documents from the cache. Clear()296 void Clear() { 297 for (auto *document : documents_) { 298 delete document; 299 } 300 documents_.clear(); 301 num_pages_per_doc_ = 0; 302 } 303 // Adds all the documents in the list of filenames, counting memory. 304 // The reader is used to read the files. 305 TESS_API 306 bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy, 307 FileReader reader); 308 309 // Adds document to the cache. 310 bool AddToCache(DocumentData *data); 311 312 // Finds and returns a document by name. 313 DocumentData *FindDocument(const std::string &document_name) const; 314 315 // Returns a page by serial number using the current cache_strategy_ to 316 // determine the mapping from serial number to page. GetPageBySerial(int serial)317 const ImageData *GetPageBySerial(int serial) { 318 if (cache_strategy_ == CS_SEQUENTIAL) { 319 return GetPageSequential(serial); 320 } else { 321 return GetPageRoundRobin(serial); 322 } 323 } 324 documents()325 const std::vector<DocumentData *> &documents() const { 326 return documents_; 327 } 328 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache 329 // strategy, could take a long time. 330 TESS_API 331 int TotalPages(); 332 333 private: 334 // Returns a page by serial number, selecting them in a round-robin fashion 335 // from all the documents. Highly disk-intensive, but doesn't need samples 336 // to be shuffled between files to begin with. 337 TESS_API 338 const ImageData *GetPageRoundRobin(int serial); 339 // Returns a page by serial number, selecting them in sequence from each file. 340 // Requires the samples to be shuffled between the files to give a random or 341 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin. 342 TESS_API 343 const ImageData *GetPageSequential(int serial); 344 345 // Helper counts the number of adjacent cached neighbour documents_ of index 346 // looking in direction dir, ie index+dir, index+2*dir etc. 347 int CountNeighbourDocs(int index, int dir); 348 349 // A group of pages that corresponds in some loose way to a document. 350 std::vector<DocumentData *> documents_; 351 // Strategy to use for caching and serializing data samples. 352 CachingStrategy cache_strategy_ = CS_SEQUENTIAL; 353 // Number of pages in the first document, used as a divisor in 354 // GetPageSequential to determine the document index. 355 int num_pages_per_doc_ = 0; 356 // Max memory allowed in this cache. 357 int64_t max_memory_ = 0; 358 }; 359 360 } // namespace tesseract 361 362 #endif // TESSERACT_IMAGE_IMAGEDATA_H_ 363