1 ///////////////////////////////////////////////////////////////////////
2 // File:        imagedata.h
3 // Description: Class to hold information about a single image and its
4 //              corresponding boxes or text file.
5 // Author:      Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_
20 #define TESSERACT_IMAGE_IMAGEDATA_H_
21 
22 #include "image.h"
23 #include "points.h" // for FCOORD
24 
25 #include <mutex>  // for std::mutex
26 #include <thread> // for std::thread
27 
28 struct Pix;
29 
30 namespace tesseract {
31 
// Forward declarations of types that are defined elsewhere.
class ScrollView;
class TBOX;
class TFile;
35 
// Padding, measured in output pixels, that is applied in feature mode.
constexpr int kFeaturePadding = 2;
// Padding, measured in pixels, that is placed around text boxes.
constexpr int kImagePadding = 4;
40 
// Selects how training samples are cached and in which order they are
// delivered.
enum CachingStrategy {
  // Whole-file-at-a-time: one file is read completely before the next one is
  // touched, so the samples need to be shuffled across files beforehand.
  // To get high-speed random access, the sample count of the first file is
  // assumed for every file. Consequently entries of smaller subsequent files
  // are used more than once, and some entries of larger subsequent files are
  // never used.
  // Best for larger data sets that don't fit in memory.
  CS_SEQUENTIAL = 0,
  // One-sample-per-file rotation: a single sample is taken from each file in
  // turn. Shuffled samples are not required, but this is extremely
  // disk-intensive, and samples in smaller files get used more often than
  // samples in larger files.
  // Best for smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN = 1,
};
56 
57 // Class to hold information on a single image:
58 // Filename, cached image as a Pix*, character boxes, text transcription.
59 // The text transcription is the ground truth UTF-8 text for the image.
60 // Character boxes are optional and indicate the desired segmentation of
61 // the text into recognition units.
62 class TESS_API ImageData {
63 public:
64   ImageData();
65   // Takes ownership of the pix.
66   ImageData(bool vertical, Image pix);
67   ~ImageData();
68 
69   // Builds and returns an ImageData from the basic data. Note that imagedata,
70   // truth_text, and box_text are all the actual file data, NOT filenames.
71   static ImageData *Build(const char *name, int page_number, const char *lang,
72                           const char *imagedata, int imagedatasize, const char *truth_text,
73                           const char *box_text);
74 
75   // Writes to the given file. Returns false in case of error.
76   bool Serialize(TFile *fp) const;
77   // Reads from the given file. Returns false in case of error.
78   bool DeSerialize(TFile *fp);
79   // As DeSerialize, but only seeks past the data - hence a static method.
80   static bool SkipDeSerialize(TFile *fp);
81 
82   // Other accessors.
imagefilename()83   const std::string &imagefilename() const {
84     return imagefilename_;
85   }
set_imagefilename(const std::string & name)86   void set_imagefilename(const std::string &name) {
87     imagefilename_ = name;
88   }
page_number()89   int page_number() const {
90     return page_number_;
91   }
set_page_number(int num)92   void set_page_number(int num) {
93     page_number_ = num;
94   }
image_data()95   const std::vector<char> &image_data() const {
96     return image_data_;
97   }
language()98   const std::string &language() const {
99     return language_;
100   }
set_language(const std::string & lang)101   void set_language(const std::string &lang) {
102     language_ = lang;
103   }
transcription()104   const std::string &transcription() const {
105     return transcription_;
106   }
boxes()107   const std::vector<TBOX> &boxes() const {
108     return boxes_;
109   }
box_texts()110   const std::vector<std::string> &box_texts() const {
111     return box_texts_;
112   }
box_text(int index)113   const std::string &box_text(int index) const {
114     return box_texts_[index];
115   }
116   // Saves the given Pix as a PNG-encoded string and destroys it.
117   // In case of missing PNG support in Leptonica use PNM format,
118   // which requires more memory.
119   void SetPix(Image pix);
120   // Returns the Pix image for *this. Must be pixDestroyed after use.
121   Image GetPix() const;
122   // Gets anything and everything with a non-nullptr pointer, prescaled to a
123   // given target_height (if 0, then the original image height), and aligned.
124   // Also returns (if not nullptr) the width and height of the scaled image.
125   // The return value is the scaled Pix, which must be pixDestroyed after use,
126   // and scale_factor (if not nullptr) is set to the scale factor that was
127   // applied to the image to achieve the target_height.
128   Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width,
129                 int *scaled_height, std::vector<TBOX> *boxes) const;
130 
131   int MemoryUsed() const;
132 
133   // Draws the data in a new window.
134   void Display() const;
135 
136   // Adds the supplied boxes and transcriptions that correspond to the correct
137   // page number.
138   void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts,
139                 const std::vector<int> &box_pages);
140 
141 private:
142   // Saves the given Pix as a PNG-encoded string and destroys it.
143   // In case of missing PNG support in Leptonica use PNM format,
144   // which requires more memory.
145   static void SetPixInternal(Image pix, std::vector<char> *image_data);
146   // Returns the Pix image for the image_data. Must be pixDestroyed after use.
147   static Image GetPixInternal(const std::vector<char> &image_data);
148   // Parses the text string as a box file and adds any discovered boxes that
149   // match the page number. Returns false on error.
150   bool AddBoxes(const char *box_text);
151 
152 private:
153   std::string imagefilename_; // File to read image from.
154   int32_t page_number_;  // Page number if multi-page tif or -1.
155   // see https://github.com/tesseract-ocr/tesseract/pull/2965
156   // EP: reconsider for tess6.0/opencv
157 #ifdef TESSERACT_IMAGEDATA_AS_PIX
158   Image internal_pix_;
159 #endif
160   std::vector<char> image_data_;  // PNG/PNM file data.
161   std::string language_;          // Language code for image.
162   std::string transcription_;     // UTF-8 ground truth of image.
163   std::vector<TBOX> boxes_;       // If non-empty boxes of the image.
164   std::vector<std::string> box_texts_; // String for text in each box.
165   bool vertical_text_;            // Image has been rotated from vertical.
166 };
167 
168 // A collection of ImageData that knows roughly how much memory it is using.
169 class DocumentData {
170 public:
171   TESS_API
172   explicit DocumentData(const std::string &name);
173   TESS_API
174   ~DocumentData();
175 
176   // Reads all the pages in the given lstmf filename to the cache. The reader
177   // is used to read the file.
178   TESS_API
179   bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader);
180   // Sets up the document, without actually loading it.
181   void SetDocument(const char *filename, int64_t max_memory, FileReader reader);
182   // Writes all the pages to the given filename. Returns false on error.
183   TESS_API
184   bool SaveDocument(const char *filename, FileWriter writer);
185 
186   // Adds the given page data to this document, counting up memory.
187   TESS_API
188   void AddPageToDocument(ImageData *page);
189 
document_name()190   const std::string &document_name() const {
191     std::lock_guard<std::mutex> lock(general_mutex_);
192     return document_name_;
193   }
NumPages()194   int NumPages() const {
195     std::lock_guard<std::mutex> lock(general_mutex_);
196     return total_pages_;
197   }
PagesSize()198   size_t PagesSize() const {
199     return pages_.size();
200   }
memory_used()201   int64_t memory_used() const {
202     std::lock_guard<std::mutex> lock(general_mutex_);
203     return memory_used_;
204   }
205   // If the given index is not currently loaded, loads it using a separate
206   // thread. Note: there are 4 cases:
207   // Document uncached: IsCached() returns false, total_pages_ < 0.
208   // Required page is available: IsPageAvailable returns true. In this case,
209   // total_pages_ > 0 and
210   // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size()
211   // Pages are loaded, but the required one is not.
212   // The requested page is being loaded by LoadPageInBackground. In this case,
213   // index == pages_offset_. Once the loading starts, the pages lock is held
214   // until it completes, at which point IsPageAvailable will unblock and return
215   // true.
216   void LoadPageInBackground(int index);
217   // Returns a pointer to the page with the given index, modulo the total
218   // number of pages. Blocks until the background load is completed.
219   TESS_API
220   const ImageData *GetPage(int index);
221   // Returns true if the requested page is available, and provides a pointer,
222   // which may be nullptr if the document is empty. May block, even though it
223   // doesn't guarantee to return true.
224   bool IsPageAvailable(int index, ImageData **page);
225   // Takes ownership of the given page index. The page is made nullptr in *this.
TakePage(int index)226   ImageData *TakePage(int index) {
227     std::lock_guard<std::mutex> lock(pages_mutex_);
228     ImageData *page = pages_[index];
229     pages_[index] = nullptr;
230     return page;
231   }
232   // Returns true if the document is currently loaded or in the process of
233   // loading.
IsCached()234   bool IsCached() const {
235     return NumPages() >= 0;
236   }
237   // Removes all pages from memory and frees the memory, but does not forget
238   // the document metadata. Returns the memory saved.
239   int64_t UnCache();
240   // Shuffles all the pages in the document.
241   void Shuffle();
242 
243 private:
244   // Sets the value of total_pages_ behind a mutex.
set_total_pages(int total)245   void set_total_pages(int total) {
246     std::lock_guard<std::mutex> lock(general_mutex_);
247     total_pages_ = total;
248   }
set_memory_used(int64_t memory_used)249   void set_memory_used(int64_t memory_used) {
250     std::lock_guard<std::mutex> lock(general_mutex_);
251     memory_used_ = memory_used;
252   }
253   // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
254   // starting at index pages_offset_.
255   bool ReCachePages();
256 
257 private:
258   // A name for this document.
259   std::string document_name_;
260   // A group of pages that corresponds in some loose way to a document.
261   std::vector<ImageData *> pages_;
262   // Page number of the first index in pages_.
263   int pages_offset_;
264   // Total number of pages in document (may exceed size of pages_.)
265   int total_pages_;
266   // Total of all pix sizes in the document.
267   int64_t memory_used_;
268   // Max memory to use at any time.
269   int64_t max_memory_;
270   // Saved reader from LoadDocument to allow re-caching.
271   FileReader reader_;
272   // Mutex that protects pages_ and pages_offset_ against multiple parallel
273   // loads, and provides a wait for page.
274   std::mutex pages_mutex_;
275   // Mutex that protects other data members that callers want to access without
276   // waiting for a load operation.
277   mutable std::mutex general_mutex_;
278 
279   // Thread which loads document.
280   std::thread thread;
281 };
282 
283 // A collection of DocumentData that knows roughly how much memory it is using.
284 // Note that while it supports background read-ahead, it assumes that a single
285 // thread is accessing documents, ie it is not safe for multiple threads to
286 // access different documents in parallel, as one may de-cache the other's
287 // content.
288 class DocumentCache {
289 public:
290   TESS_API
291   explicit DocumentCache(int64_t max_memory);
292   TESS_API
293   ~DocumentCache();
294 
295   // Deletes all existing documents from the cache.
Clear()296   void Clear() {
297     for (auto *document : documents_) {
298       delete document;
299     }
300     documents_.clear();
301     num_pages_per_doc_ = 0;
302   }
303   // Adds all the documents in the list of filenames, counting memory.
304   // The reader is used to read the files.
305   TESS_API
306   bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy,
307                      FileReader reader);
308 
309   // Adds document to the cache.
310   bool AddToCache(DocumentData *data);
311 
312   // Finds and returns a document by name.
313   DocumentData *FindDocument(const std::string &document_name) const;
314 
315   // Returns a page by serial number using the current cache_strategy_ to
316   // determine the mapping from serial number to page.
GetPageBySerial(int serial)317   const ImageData *GetPageBySerial(int serial) {
318     if (cache_strategy_ == CS_SEQUENTIAL) {
319       return GetPageSequential(serial);
320     } else {
321       return GetPageRoundRobin(serial);
322     }
323   }
324 
documents()325   const std::vector<DocumentData *> &documents() const {
326     return documents_;
327   }
328   // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
329   // strategy, could take a long time.
330   TESS_API
331   int TotalPages();
332 
333 private:
334   // Returns a page by serial number, selecting them in a round-robin fashion
335   // from all the documents. Highly disk-intensive, but doesn't need samples
336   // to be shuffled between files to begin with.
337   TESS_API
338   const ImageData *GetPageRoundRobin(int serial);
339   // Returns a page by serial number, selecting them in sequence from each file.
340   // Requires the samples to be shuffled between the files to give a random or
341   // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
342   TESS_API
343   const ImageData *GetPageSequential(int serial);
344 
345   // Helper counts the number of adjacent cached neighbour documents_ of index
346   // looking in direction dir, ie index+dir, index+2*dir etc.
347   int CountNeighbourDocs(int index, int dir);
348 
349   // A group of pages that corresponds in some loose way to a document.
350   std::vector<DocumentData *> documents_;
351   // Strategy to use for caching and serializing data samples.
352   CachingStrategy cache_strategy_ = CS_SEQUENTIAL;
353   // Number of pages in the first document, used as a divisor in
354   // GetPageSequential to determine the document index.
355   int num_pages_per_doc_ = 0;
356   // Max memory allowed in this cache.
357   int64_t max_memory_ = 0;
358 };
359 
360 } // namespace tesseract
361 
362 #endif // TESSERACT_IMAGE_IMAGEDATA_H_
363