1 ///////////////////////////////////////////////////////////////////////
2 // File:        imagedata.cpp
3 // Description: Class to hold information about a single multi-page tiff
4 //              training file and its corresponding boxes or text file.
5 // Author:      Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #  include "config_auto.h"
22 #endif
23 
24 #include "imagedata.h"
25 
26 #include "boxread.h"    // for ReadMemBoxes
27 #include "rect.h"       // for TBOX
28 #include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE
29 #include "tprintf.h"    // for tprintf
30 
31 #include "helpers.h"  // for IntCastRounded, TRand, ClipToRange, Modulo
32 #include "serialis.h" // for TFile
33 
34 #include <allheaders.h> // for pixDestroy, pixGetHeight, pixGetWidth, lept_...
35 
36 #include <cinttypes> // for PRId64
37 
38 namespace tesseract {
39 
// Upper bound on how many documents ahead of the current one
// GetPageRoundRobin asks to start loading. A small value is sufficient.
constexpr int kMaxReadAhead = 8;
43 
ImageData()44 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {}
45 // Takes ownership of the pix and destroys it.
ImageData(bool vertical,Image pix)46 ImageData::ImageData(bool vertical, Image pix)
47     : page_number_(0), vertical_text_(vertical) {
48   SetPix(pix);
49 }
~ImageData()50 ImageData::~ImageData() {
51 #ifdef TESSERACT_IMAGEDATA_AS_PIX
52   internal_pix_.destroy();
53 #endif
54 }
55 
56 // Builds and returns an ImageData from the basic data. Note that imagedata,
57 // truth_text, and box_text are all the actual file data, NOT filenames.
Build(const char * name,int page_number,const char * lang,const char * imagedata,int imagedatasize,const char * truth_text,const char * box_text)58 ImageData *ImageData::Build(const char *name, int page_number, const char *lang,
59                             const char *imagedata, int imagedatasize,
60                             const char *truth_text, const char *box_text) {
61   auto *image_data = new ImageData();
62   image_data->imagefilename_ = name;
63   image_data->page_number_ = page_number;
64   image_data->language_ = lang;
65   // Save the imagedata.
66   // TODO: optimize resize (no init).
67   image_data->image_data_.resize(imagedatasize);
68   memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
69   if (!image_data->AddBoxes(box_text)) {
70     if (truth_text == nullptr || truth_text[0] == '\0') {
71       tprintf("Error: No text corresponding to page %d from image %s!\n",
72               page_number, name);
73       delete image_data;
74       return nullptr;
75     }
76     image_data->transcription_ = truth_text;
77     // If we have no boxes, the transcription is in the 0th box_texts_.
78     image_data->box_texts_.emplace_back(truth_text);
79     // We will create a box for the whole image on PreScale, to save unpacking
80     // the image now.
81   } else if (truth_text != nullptr && truth_text[0] != '\0' &&
82              image_data->transcription_ != truth_text) {
83     // Save the truth text as it is present and disagrees with the box text.
84     image_data->transcription_ = truth_text;
85   }
86   return image_data;
87 }
88 
89 // Writes to the given file. Returns false in case of error.
Serialize(TFile * fp) const90 bool ImageData::Serialize(TFile *fp) const {
91   if (!fp->Serialize(imagefilename_)) {
92     return false;
93   }
94   if (!fp->Serialize(&page_number_)) {
95     return false;
96   }
97   if (!fp->Serialize(image_data_)) {
98     return false;
99   }
100   if (!fp->Serialize(language_)) {
101     return false;
102   }
103   if (!fp->Serialize(transcription_)) {
104     return false;
105   }
106   if (!fp->Serialize(boxes_)) {
107     return false;
108   }
109   if (!fp->Serialize(box_texts_)) {
110     return false;
111   }
112   int8_t vertical = vertical_text_;
113   return fp->Serialize(&vertical);
114 }
115 
116 // Reads from the given file. Returns false in case of error.
DeSerialize(TFile * fp)117 bool ImageData::DeSerialize(TFile *fp) {
118   if (!fp->DeSerialize(imagefilename_)) {
119     return false;
120   }
121   if (!fp->DeSerialize(&page_number_)) {
122     return false;
123   }
124   if (!fp->DeSerialize(image_data_)) {
125     return false;
126   }
127   if (!fp->DeSerialize(language_)) {
128     return false;
129   }
130   if (!fp->DeSerialize(transcription_)) {
131     return false;
132   }
133   if (!fp->DeSerialize(boxes_)) {
134     return false;
135   }
136   if (!fp->DeSerialize(box_texts_)) {
137     return false;
138   }
139   int8_t vertical = 0;
140   if (!fp->DeSerialize(&vertical)) {
141     return false;
142   }
143   vertical_text_ = vertical != 0;
144   return true;
145 }
146 
147 // As DeSerialize, but only seeks past the data - hence a static method.
SkipDeSerialize(TFile * fp)148 bool ImageData::SkipDeSerialize(TFile *fp) {
149   if (!fp->DeSerializeSkip()) {
150     return false;
151   }
152   int32_t page_number;
153   if (!fp->DeSerialize(&page_number)) {
154     return false;
155   }
156   if (!fp->DeSerializeSkip()) {
157     return false;
158   }
159   if (!fp->DeSerializeSkip()) {
160     return false;
161   }
162   if (!fp->DeSerializeSkip()) {
163     return false;
164   }
165   if (!fp->DeSerializeSkip(sizeof(TBOX))) {
166     return false;
167   }
168   int32_t number;
169   if (!fp->DeSerialize(&number)) {
170     return false;
171   }
172   for (int i = 0; i < number; i++) {
173     if (!fp->DeSerializeSkip()) {
174       return false;
175     }
176   }
177   int8_t vertical = 0;
178   return fp->DeSerialize(&vertical);
179 }
180 
181 // Saves the given Pix as a PNG-encoded string and destroys it.
182 // In case of missing PNG support in Leptonica use PNM format,
183 // which requires more memory.
SetPix(Image pix)184 void ImageData::SetPix(Image pix) {
185 #ifdef TESSERACT_IMAGEDATA_AS_PIX
186   internal_pix_ = pix;
187 #else
188   SetPixInternal(pix, &image_data_);
189 #endif
190 }
191 
192 // Returns the Pix image for *this. Must be pixDestroyed after use.
GetPix() const193 Image ImageData::GetPix() const {
194 #ifdef TESSERACT_IMAGEDATA_AS_PIX
195 #  ifdef GRAPHICS_DISABLED
196   /* The only caller of this is the scaling functions to prescale the
197    * source. Thus we can just return a new pointer to the same data. */
198   return internal_pix_.clone();
199 #  else
200   /* pixCopy always does an actual copy, so the caller can modify the
201    * changed data. */
202   return internal_pix_.copy();
203 #  endif
204 #else
205   return GetPixInternal(image_data_);
206 #endif
207 }
208 
209 // Gets anything and everything with a non-nullptr pointer, prescaled to a
210 // given target_height (if 0, then the original image height), and aligned.
211 // Also returns (if not nullptr) the width and height of the scaled image.
212 // The return value is the scaled Pix, which must be pixDestroyed after use,
213 // and scale_factor (if not nullptr) is set to the scale factor that was applied
214 // to the image to achieve the target_height.
PreScale(int target_height,int max_height,float * scale_factor,int * scaled_width,int * scaled_height,std::vector<TBOX> * boxes) const215 Image ImageData::PreScale(int target_height, int max_height,
216                           float *scale_factor, int *scaled_width,
217                           int *scaled_height, std::vector<TBOX> *boxes) const {
218   int input_width = 0;
219   int input_height = 0;
220   Image src_pix = GetPix();
221   ASSERT_HOST(src_pix != nullptr);
222   input_width = pixGetWidth(src_pix);
223   input_height = pixGetHeight(src_pix);
224   if (target_height == 0) {
225     target_height = std::min(input_height, max_height);
226   }
227   float im_factor = static_cast<float>(target_height) / input_height;
228   if (scaled_width != nullptr) {
229     *scaled_width = IntCastRounded(im_factor * input_width);
230   }
231   if (scaled_height != nullptr) {
232     *scaled_height = target_height;
233   }
234   // Get the scaled image.
235   Image pix = pixScale(src_pix, im_factor, im_factor);
236   if (pix == nullptr) {
237     tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
238             input_width, input_height, im_factor);
239     src_pix.destroy();
240     return nullptr;
241   }
242   if (scaled_width != nullptr) {
243     *scaled_width = pixGetWidth(pix);
244   }
245   if (scaled_height != nullptr) {
246     *scaled_height = pixGetHeight(pix);
247   }
248   src_pix.destroy();
249   if (boxes != nullptr) {
250     // Get the boxes.
251     boxes->clear();
252     for (auto box : boxes_) {
253       box.scale(im_factor);
254       boxes->push_back(box);
255     }
256     if (boxes->empty()) {
257       // Make a single box for the whole image.
258       TBOX box(0, 0, im_factor * input_width, target_height);
259       boxes->push_back(box);
260     }
261   }
262   if (scale_factor != nullptr) {
263     *scale_factor = im_factor;
264   }
265   return pix;
266 }
267 
MemoryUsed() const268 int ImageData::MemoryUsed() const {
269   return image_data_.size();
270 }
271 
272 #ifndef GRAPHICS_DISABLED
273 
274 // Draws the data in a new window.
Display() const275 void ImageData::Display() const {
276   const int kTextSize = 64;
277   // Draw the image.
278   Image pix = GetPix();
279   if (pix == nullptr) {
280     return;
281   }
282   int width = pixGetWidth(pix);
283   int height = pixGetHeight(pix);
284   auto *win = new ScrollView("Imagedata", 100, 100, 2 * (width + 2 * kTextSize),
285                              2 * (height + 4 * kTextSize), width + 10,
286                              height + 3 * kTextSize, true);
287   win->Draw(pix, 0, height - 1);
288   pix.destroy();
289   // Draw the boxes.
290   win->Pen(ScrollView::RED);
291   win->Brush(ScrollView::NONE);
292   int text_size = kTextSize;
293   if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) {
294     text_size = boxes_[0].height() * 2;
295   }
296   win->TextAttributes("Arial", text_size, false, false, false);
297   if (!boxes_.empty()) {
298     for (unsigned b = 0; b < boxes_.size(); ++b) {
299       boxes_[b].plot(win);
300       win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
301     }
302   } else {
303     // The full transcription.
304     win->Pen(ScrollView::CYAN);
305     win->Text(0, height + kTextSize * 2, transcription_.c_str());
306   }
307   win->Update();
308   win->Wait();
309 }
310 
311 #endif
312 
313 // Adds the supplied boxes and transcriptions that correspond to the correct
314 // page number.
AddBoxes(const std::vector<TBOX> & boxes,const std::vector<std::string> & texts,const std::vector<int> & box_pages)315 void ImageData::AddBoxes(const std::vector<TBOX> &boxes,
316                          const std::vector<std::string> &texts,
317                          const std::vector<int> &box_pages) {
318   // Copy the boxes and make the transcription.
319   for (unsigned i = 0; i < box_pages.size(); ++i) {
320     if (page_number_ >= 0 && box_pages[i] != page_number_) {
321       continue;
322     }
323     transcription_ += texts[i];
324     boxes_.push_back(boxes[i]);
325     box_texts_.push_back(texts[i]);
326   }
327 }
328 
329 #ifndef TESSERACT_IMAGEDATA_AS_PIX
330 // Saves the given Pix as a PNG-encoded string and destroys it.
331 // In case of missing PNG support in Leptonica use PNM format,
332 // which requires more memory.
SetPixInternal(Image pix,std::vector<char> * image_data)333 void ImageData::SetPixInternal(Image pix, std::vector<char> *image_data) {
334   l_uint8 *data;
335   size_t size;
336   l_int32 ret;
337   ret = pixWriteMem(&data, &size, pix, IFF_PNG);
338   if (ret) {
339     ret = pixWriteMem(&data, &size, pix, IFF_PNM);
340   }
341   pix.destroy();
342   // TODO: optimize resize (no init).
343   image_data->resize(size);
344   memcpy(&(*image_data)[0], data, size);
345   lept_free(data);
346 }
347 
348 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
GetPixInternal(const std::vector<char> & image_data)349 Image ImageData::GetPixInternal(const std::vector<char> &image_data) {
350   Image pix = nullptr;
351   if (!image_data.empty()) {
352     // Convert the array to an image.
353     const auto *u_data =
354         reinterpret_cast<const unsigned char *>(&image_data[0]);
355     pix = pixReadMem(u_data, image_data.size());
356   }
357   return pix;
358 }
359 #endif
360 
361 // Parses the text string as a box file and adds any discovered boxes that
362 // match the page number. Returns false on error.
AddBoxes(const char * box_text)363 bool ImageData::AddBoxes(const char *box_text) {
364   if (box_text != nullptr && box_text[0] != '\0') {
365     std::vector<TBOX> boxes;
366     std::vector<std::string> texts;
367     std::vector<int> box_pages;
368     if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
369                      /*continue_on_failure*/ true, &boxes, &texts, nullptr,
370                      &box_pages)) {
371       AddBoxes(boxes, texts, box_pages);
372       return true;
373     } else {
374       tprintf("Error: No boxes for page %d from image %s!\n", page_number_,
375               imagefilename_.c_str());
376     }
377   }
378   return false;
379 }
380 
DocumentData(const std::string & name)381 DocumentData::DocumentData(const std::string &name)
382     : document_name_(name),
383       pages_offset_(-1),
384       total_pages_(-1),
385       memory_used_(0),
386       max_memory_(0),
387       reader_(nullptr) {}
388 
~DocumentData()389 DocumentData::~DocumentData() {
390   if (thread.joinable()) {
391     thread.join();
392   }
393   std::lock_guard<std::mutex> lock_p(pages_mutex_);
394   std::lock_guard<std::mutex> lock_g(general_mutex_);
395   for (auto data : pages_) {
396     delete data;
397   }
398 }
399 
400 // Reads all the pages in the given lstmf filename to the cache. The reader
401 // is used to read the file.
LoadDocument(const char * filename,int start_page,int64_t max_memory,FileReader reader)402 bool DocumentData::LoadDocument(const char *filename, int start_page,
403                                 int64_t max_memory, FileReader reader) {
404   SetDocument(filename, max_memory, reader);
405   pages_offset_ = start_page;
406   return ReCachePages();
407 }
408 
409 // Sets up the document, without actually loading it.
SetDocument(const char * filename,int64_t max_memory,FileReader reader)410 void DocumentData::SetDocument(const char *filename, int64_t max_memory,
411                                FileReader reader) {
412   std::lock_guard<std::mutex> lock_p(pages_mutex_);
413   std::lock_guard<std::mutex> lock(general_mutex_);
414   document_name_ = filename;
415   pages_offset_ = -1;
416   max_memory_ = max_memory;
417   reader_ = reader;
418 }
419 
420 // Writes all the pages to the given filename. Returns false on error.
SaveDocument(const char * filename,FileWriter writer)421 bool DocumentData::SaveDocument(const char *filename, FileWriter writer) {
422   std::lock_guard<std::mutex> lock(pages_mutex_);
423   TFile fp;
424   fp.OpenWrite(nullptr);
425   if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) {
426     tprintf("Serialize failed: %s\n", filename);
427     return false;
428   }
429   return true;
430 }
431 
432 // Adds the given page data to this document, counting up memory.
AddPageToDocument(ImageData * page)433 void DocumentData::AddPageToDocument(ImageData *page) {
434   std::lock_guard<std::mutex> lock(pages_mutex_);
435   pages_.push_back(page);
436   set_memory_used(memory_used() + page->MemoryUsed());
437 }
438 
439 // If the given index is not currently loaded, loads it using a separate
440 // thread.
LoadPageInBackground(int index)441 void DocumentData::LoadPageInBackground(int index) {
442   ImageData *page = nullptr;
443   if (IsPageAvailable(index, &page)) {
444     return;
445   }
446   {
447     std::lock_guard<std::mutex> lock(pages_mutex_);
448     if (pages_offset_ == index) {
449       return;
450     }
451     pages_offset_ = index;
452     for (auto page : pages_) {
453       delete page;
454     }
455     pages_.clear();
456   }
457   if (thread.joinable()) {
458     thread.join();
459   }
460   // Don't run next statement asynchronously because that would
461   // create too many threads on Linux (see issue #3111).
462   ReCachePages();
463 }
464 
465 // Returns a pointer to the page with the given index, modulo the total
466 // number of pages. Blocks until the background load is completed.
GetPage(int index)467 const ImageData *DocumentData::GetPage(int index) {
468   ImageData *page = nullptr;
469   while (!IsPageAvailable(index, &page)) {
470     // If there is no background load scheduled, schedule one now.
471     pages_mutex_.lock();
472     bool needs_loading = pages_offset_ != index;
473     pages_mutex_.unlock();
474     if (needs_loading) {
475       LoadPageInBackground(index);
476     }
477     // We can't directly load the page, or the background load will delete it
478     // while the caller is using it, so give it a chance to work.
479     std::this_thread::yield();
480   }
481   return page;
482 }
483 
484 // Returns true if the requested page is available, and provides a pointer,
485 // which may be nullptr if the document is empty. May block, even though it
486 // doesn't guarantee to return true.
IsPageAvailable(int index,ImageData ** page)487 bool DocumentData::IsPageAvailable(int index, ImageData **page) {
488   std::lock_guard<std::mutex> lock(pages_mutex_);
489   int num_pages = NumPages();
490   if (num_pages == 0 || index < 0) {
491     *page = nullptr; // Empty Document.
492     return true;
493   }
494   if (num_pages > 0) {
495     index = Modulo(index, num_pages);
496     if (pages_offset_ <= index &&
497         static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {
498       *page = pages_[index - pages_offset_]; // Page is available already.
499       return true;
500     }
501   }
502   return false;
503 }
504 
505 // Removes all pages from memory and frees the memory, but does not forget
506 // the document metadata.
UnCache()507 int64_t DocumentData::UnCache() {
508   std::lock_guard<std::mutex> lock(pages_mutex_);
509   int64_t memory_saved = memory_used();
510   for (auto page : pages_) {
511     delete page;
512   }
513   pages_.clear();
514   pages_offset_ = -1;
515   set_total_pages(-1);
516   set_memory_used(0);
517   tprintf("Unloaded document %s, saving %" PRId64 " memory\n",
518           document_name_.c_str(), memory_saved);
519   return memory_saved;
520 }
521 
522 // Shuffles all the pages in the document.
Shuffle()523 void DocumentData::Shuffle() {
524   TRand random;
525   // Different documents get shuffled differently, but the same for the same
526   // name.
527   random.set_seed(document_name_.c_str());
528   int num_pages = pages_.size();
529   // Execute one random swap for each page in the document.
530   for (int i = 0; i < num_pages; ++i) {
531     int src = random.IntRand() % num_pages;
532     int dest = random.IntRand() % num_pages;
533     std::swap(pages_[src], pages_[dest]);
534   }
535 }
536 
537 // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
538 // starting at index pages_offset_.
ReCachePages()539 bool DocumentData::ReCachePages() {
540   std::lock_guard<std::mutex> lock(pages_mutex_);
541   // Read the file.
542   set_total_pages(0);
543   set_memory_used(0);
544   int loaded_pages = 0;
545   for (auto page : pages_) {
546     delete page;
547   }
548   pages_.clear();
549   TFile fp;
550   if (!fp.Open(document_name_.c_str(), reader_) ||
551       !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) {
552     tprintf("Deserialize header failed: %s\n", document_name_.c_str());
553     return false;
554   }
555   pages_offset_ %= loaded_pages;
556   // Skip pages before the first one we want, and load the rest until max
557   // memory and skip the rest after that.
558   int page;
559   for (page = 0; page < loaded_pages; ++page) {
560     uint8_t non_null;
561     if (!fp.DeSerialize(&non_null)) {
562       break;
563     }
564     if (page < pages_offset_ ||
565         (max_memory_ > 0 && memory_used() > max_memory_)) {
566       if (non_null && !ImageData::SkipDeSerialize(&fp)) {
567         break;
568       }
569     } else {
570       ImageData *image_data = nullptr;
571       if (non_null) {
572         image_data = new ImageData;
573         if (!image_data->DeSerialize(&fp)) {
574           delete image_data;
575           break;
576         }
577       }
578       pages_.push_back(image_data);
579       if (image_data->imagefilename().empty()) {
580         image_data->set_imagefilename(document_name_);
581         image_data->set_page_number(page);
582       }
583       set_memory_used(memory_used() + image_data->MemoryUsed());
584     }
585   }
586   if (page < loaded_pages) {
587     tprintf("Deserialize failed: %s read %d/%d lines\n", document_name_.c_str(),
588             page, loaded_pages);
589     for (auto page : pages_) {
590       delete page;
591     }
592     pages_.clear();
593   } else if (loaded_pages > 1) {
594     // Avoid lots of messages for training with single line images.
595     tprintf("Loaded %zu/%d lines (%d-%zu) of document %s\n", pages_.size(),
596             loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),
597             document_name_.c_str());
598   }
599   set_total_pages(loaded_pages);
600   return !pages_.empty();
601 }
602 
603 // A collection of DocumentData that knows roughly how much memory it is using.
DocumentCache(int64_t max_memory)604 DocumentCache::DocumentCache(int64_t max_memory) : max_memory_(max_memory) {}
605 
~DocumentCache()606 DocumentCache::~DocumentCache() {
607   for (auto *document : documents_) {
608     delete document;
609   }
610 }
611 
612 // Adds all the documents in the list of filenames, counting memory.
613 // The reader is used to read the files.
LoadDocuments(const std::vector<std::string> & filenames,CachingStrategy cache_strategy,FileReader reader)614 bool DocumentCache::LoadDocuments(const std::vector<std::string> &filenames,
615                                   CachingStrategy cache_strategy,
616                                   FileReader reader) {
617   cache_strategy_ = cache_strategy;
618   int64_t fair_share_memory = 0;
619   // In the round-robin case, each DocumentData handles restricting its content
620   // to its fair share of memory. In the sequential case, DocumentCache
621   // determines which DocumentDatas are held entirely in memory.
622   if (cache_strategy_ == CS_ROUND_ROBIN) {
623     fair_share_memory = max_memory_ / filenames.size();
624   }
625   for (const auto &filename : filenames) {
626     auto *document = new DocumentData(filename);
627     document->SetDocument(filename.c_str(), fair_share_memory, reader);
628     AddToCache(document);
629   }
630   if (!documents_.empty()) {
631     // Try to get the first page now to verify the list of filenames.
632     if (GetPageBySerial(0) != nullptr) {
633       return true;
634     }
635     tprintf("Load of page 0 failed!\n");
636   }
637   return false;
638 }
639 
640 // Adds document to the cache.
AddToCache(DocumentData * data)641 bool DocumentCache::AddToCache(DocumentData *data) {
642   documents_.push_back(data);
643   return true;
644 }
645 
646 // Finds and returns a document by name.
FindDocument(const std::string & document_name) const647 DocumentData *DocumentCache::FindDocument(
648     const std::string &document_name) const {
649   for (auto *document : documents_) {
650     if (document->document_name() == document_name) {
651       return document;
652     }
653   }
654   return nullptr;
655 }
656 
657 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
658 // strategy, could take a long time.
TotalPages()659 int DocumentCache::TotalPages() {
660   if (cache_strategy_ == CS_SEQUENTIAL) {
661     // In sequential mode, we assume each doc has the same number of pages
662     // whether it is true or not.
663     if (num_pages_per_doc_ == 0) {
664       GetPageSequential(0);
665     }
666     return num_pages_per_doc_ * documents_.size();
667   }
668   int total_pages = 0;
669   for (auto *document : documents_) {
670     // We have to load a page to make NumPages() valid.
671     document->GetPage(0);
672     total_pages += document->NumPages();
673   }
674   return total_pages;
675 }
676 
677 // Returns a page by serial number, selecting them in a round-robin fashion
678 // from all the documents. Highly disk-intensive, but doesn't need samples
679 // to be shuffled between files to begin with.
GetPageRoundRobin(int serial)680 const ImageData *DocumentCache::GetPageRoundRobin(int serial) {
681   int num_docs = documents_.size();
682   int doc_index = serial % num_docs;
683   const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs);
684   for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
685     doc_index = (serial + offset) % num_docs;
686     int page = (serial + offset) / num_docs;
687     documents_[doc_index]->LoadPageInBackground(page);
688   }
689   return doc;
690 }
691 
692 // Returns a page by serial number, selecting them in sequence from each file.
693 // Requires the samples to be shuffled between the files to give a random or
694 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
GetPageSequential(int serial)695 const ImageData *DocumentCache::GetPageSequential(int serial) {
696   int num_docs = documents_.size();
697   ASSERT_HOST(num_docs > 0);
698   if (num_pages_per_doc_ == 0) {
699     // Use the pages in the first doc as the number of pages in each doc.
700     documents_[0]->GetPage(0);
701     num_pages_per_doc_ = documents_[0]->NumPages();
702     if (num_pages_per_doc_ == 0) {
703       tprintf("First document cannot be empty!!\n");
704       ASSERT_HOST(num_pages_per_doc_ > 0);
705     }
706     // Get rid of zero now if we don't need it.
707     if (serial / num_pages_per_doc_ % num_docs > 0) {
708       documents_[0]->UnCache();
709     }
710   }
711   int doc_index = serial / num_pages_per_doc_ % num_docs;
712   const ImageData *doc =
713       documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
714   // Count up total memory. Background loading makes it more complicated to
715   // keep a running count.
716   int64_t total_memory = 0;
717   for (auto *document : documents_) {
718     total_memory += document->memory_used();
719   }
720   if (total_memory >= max_memory_) {
721     // Find something to un-cache.
722     // If there are more than 3 in front, then serial is from the back reader
723     // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
724     // we create a hole between them and then un-caching the backmost occupied
725     // will work for both.
726     int num_in_front = CountNeighbourDocs(doc_index, 1);
727     for (int offset = num_in_front - 2;
728          offset > 1 && total_memory >= max_memory_; --offset) {
729       int next_index = (doc_index + offset) % num_docs;
730       total_memory -= documents_[next_index]->UnCache();
731     }
732     // If that didn't work, the best solution is to un-cache from the back. If
733     // we take away the document that a 2nd reader is using, it will put it
734     // back and make a hole between.
735     int num_behind = CountNeighbourDocs(doc_index, -1);
736     for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
737          ++offset) {
738       int next_index = (doc_index + offset + num_docs) % num_docs;
739       total_memory -= documents_[next_index]->UnCache();
740     }
741   }
742   int next_index = (doc_index + 1) % num_docs;
743   if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
744     documents_[next_index]->LoadPageInBackground(0);
745   }
746   return doc;
747 }
748 
749 // Helper counts the number of adjacent cached neighbours of index looking in
750 // direction dir, ie index+dir, index+2*dir etc.
CountNeighbourDocs(int index,int dir)751 int DocumentCache::CountNeighbourDocs(int index, int dir) {
752   int num_docs = documents_.size();
753   for (int offset = dir; abs(offset) < num_docs; offset += dir) {
754     int offset_index = (index + offset + num_docs) % num_docs;
755     if (!documents_[offset_index]->IsCached()) {
756       return offset - dir;
757     }
758   }
759   return num_docs;
760 }
761 
762 } // namespace tesseract.
763