1 ///////////////////////////////////////////////////////////////////////
2 // File: imagedata.cpp
3 // Description: Class to hold information about a single multi-page tiff
4 // training file and its corresponding boxes or text file.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "imagedata.h"
25
26 #include "boxread.h" // for ReadMemBoxes
27 #include "rect.h" // for TBOX
28 #include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE
29 #include "tprintf.h" // for tprintf
30
31 #include "helpers.h" // for IntCastRounded, TRand, ClipToRange, Modulo
32 #include "serialis.h" // for TFile
33
34 #include <allheaders.h> // for pixDestroy, pixGetHeight, pixGetWidth, lept_...
35
36 #include <cinttypes> // for PRId64
37
38 namespace tesseract {
39
// Upper bound on how many neighbouring documents get a page load requested
// ahead of the one currently being read while training. A small value is
// sufficient.
constexpr int kMaxReadAhead = 8;
43
ImageData()44 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {}
45 // Takes ownership of the pix and destroys it.
ImageData(bool vertical,Image pix)46 ImageData::ImageData(bool vertical, Image pix)
47 : page_number_(0), vertical_text_(vertical) {
48 SetPix(pix);
49 }
~ImageData()50 ImageData::~ImageData() {
51 #ifdef TESSERACT_IMAGEDATA_AS_PIX
52 internal_pix_.destroy();
53 #endif
54 }
55
56 // Builds and returns an ImageData from the basic data. Note that imagedata,
57 // truth_text, and box_text are all the actual file data, NOT filenames.
Build(const char * name,int page_number,const char * lang,const char * imagedata,int imagedatasize,const char * truth_text,const char * box_text)58 ImageData *ImageData::Build(const char *name, int page_number, const char *lang,
59 const char *imagedata, int imagedatasize,
60 const char *truth_text, const char *box_text) {
61 auto *image_data = new ImageData();
62 image_data->imagefilename_ = name;
63 image_data->page_number_ = page_number;
64 image_data->language_ = lang;
65 // Save the imagedata.
66 // TODO: optimize resize (no init).
67 image_data->image_data_.resize(imagedatasize);
68 memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
69 if (!image_data->AddBoxes(box_text)) {
70 if (truth_text == nullptr || truth_text[0] == '\0') {
71 tprintf("Error: No text corresponding to page %d from image %s!\n",
72 page_number, name);
73 delete image_data;
74 return nullptr;
75 }
76 image_data->transcription_ = truth_text;
77 // If we have no boxes, the transcription is in the 0th box_texts_.
78 image_data->box_texts_.emplace_back(truth_text);
79 // We will create a box for the whole image on PreScale, to save unpacking
80 // the image now.
81 } else if (truth_text != nullptr && truth_text[0] != '\0' &&
82 image_data->transcription_ != truth_text) {
83 // Save the truth text as it is present and disagrees with the box text.
84 image_data->transcription_ = truth_text;
85 }
86 return image_data;
87 }
88
89 // Writes to the given file. Returns false in case of error.
Serialize(TFile * fp) const90 bool ImageData::Serialize(TFile *fp) const {
91 if (!fp->Serialize(imagefilename_)) {
92 return false;
93 }
94 if (!fp->Serialize(&page_number_)) {
95 return false;
96 }
97 if (!fp->Serialize(image_data_)) {
98 return false;
99 }
100 if (!fp->Serialize(language_)) {
101 return false;
102 }
103 if (!fp->Serialize(transcription_)) {
104 return false;
105 }
106 if (!fp->Serialize(boxes_)) {
107 return false;
108 }
109 if (!fp->Serialize(box_texts_)) {
110 return false;
111 }
112 int8_t vertical = vertical_text_;
113 return fp->Serialize(&vertical);
114 }
115
116 // Reads from the given file. Returns false in case of error.
DeSerialize(TFile * fp)117 bool ImageData::DeSerialize(TFile *fp) {
118 if (!fp->DeSerialize(imagefilename_)) {
119 return false;
120 }
121 if (!fp->DeSerialize(&page_number_)) {
122 return false;
123 }
124 if (!fp->DeSerialize(image_data_)) {
125 return false;
126 }
127 if (!fp->DeSerialize(language_)) {
128 return false;
129 }
130 if (!fp->DeSerialize(transcription_)) {
131 return false;
132 }
133 if (!fp->DeSerialize(boxes_)) {
134 return false;
135 }
136 if (!fp->DeSerialize(box_texts_)) {
137 return false;
138 }
139 int8_t vertical = 0;
140 if (!fp->DeSerialize(&vertical)) {
141 return false;
142 }
143 vertical_text_ = vertical != 0;
144 return true;
145 }
146
147 // As DeSerialize, but only seeks past the data - hence a static method.
SkipDeSerialize(TFile * fp)148 bool ImageData::SkipDeSerialize(TFile *fp) {
149 if (!fp->DeSerializeSkip()) {
150 return false;
151 }
152 int32_t page_number;
153 if (!fp->DeSerialize(&page_number)) {
154 return false;
155 }
156 if (!fp->DeSerializeSkip()) {
157 return false;
158 }
159 if (!fp->DeSerializeSkip()) {
160 return false;
161 }
162 if (!fp->DeSerializeSkip()) {
163 return false;
164 }
165 if (!fp->DeSerializeSkip(sizeof(TBOX))) {
166 return false;
167 }
168 int32_t number;
169 if (!fp->DeSerialize(&number)) {
170 return false;
171 }
172 for (int i = 0; i < number; i++) {
173 if (!fp->DeSerializeSkip()) {
174 return false;
175 }
176 }
177 int8_t vertical = 0;
178 return fp->DeSerialize(&vertical);
179 }
180
181 // Saves the given Pix as a PNG-encoded string and destroys it.
182 // In case of missing PNG support in Leptonica use PNM format,
183 // which requires more memory.
SetPix(Image pix)184 void ImageData::SetPix(Image pix) {
185 #ifdef TESSERACT_IMAGEDATA_AS_PIX
186 internal_pix_ = pix;
187 #else
188 SetPixInternal(pix, &image_data_);
189 #endif
190 }
191
192 // Returns the Pix image for *this. Must be pixDestroyed after use.
GetPix() const193 Image ImageData::GetPix() const {
194 #ifdef TESSERACT_IMAGEDATA_AS_PIX
195 # ifdef GRAPHICS_DISABLED
196 /* The only caller of this is the scaling functions to prescale the
197 * source. Thus we can just return a new pointer to the same data. */
198 return internal_pix_.clone();
199 # else
200 /* pixCopy always does an actual copy, so the caller can modify the
201 * changed data. */
202 return internal_pix_.copy();
203 # endif
204 #else
205 return GetPixInternal(image_data_);
206 #endif
207 }
208
209 // Gets anything and everything with a non-nullptr pointer, prescaled to a
210 // given target_height (if 0, then the original image height), and aligned.
211 // Also returns (if not nullptr) the width and height of the scaled image.
212 // The return value is the scaled Pix, which must be pixDestroyed after use,
213 // and scale_factor (if not nullptr) is set to the scale factor that was applied
214 // to the image to achieve the target_height.
PreScale(int target_height,int max_height,float * scale_factor,int * scaled_width,int * scaled_height,std::vector<TBOX> * boxes) const215 Image ImageData::PreScale(int target_height, int max_height,
216 float *scale_factor, int *scaled_width,
217 int *scaled_height, std::vector<TBOX> *boxes) const {
218 int input_width = 0;
219 int input_height = 0;
220 Image src_pix = GetPix();
221 ASSERT_HOST(src_pix != nullptr);
222 input_width = pixGetWidth(src_pix);
223 input_height = pixGetHeight(src_pix);
224 if (target_height == 0) {
225 target_height = std::min(input_height, max_height);
226 }
227 float im_factor = static_cast<float>(target_height) / input_height;
228 if (scaled_width != nullptr) {
229 *scaled_width = IntCastRounded(im_factor * input_width);
230 }
231 if (scaled_height != nullptr) {
232 *scaled_height = target_height;
233 }
234 // Get the scaled image.
235 Image pix = pixScale(src_pix, im_factor, im_factor);
236 if (pix == nullptr) {
237 tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
238 input_width, input_height, im_factor);
239 src_pix.destroy();
240 return nullptr;
241 }
242 if (scaled_width != nullptr) {
243 *scaled_width = pixGetWidth(pix);
244 }
245 if (scaled_height != nullptr) {
246 *scaled_height = pixGetHeight(pix);
247 }
248 src_pix.destroy();
249 if (boxes != nullptr) {
250 // Get the boxes.
251 boxes->clear();
252 for (auto box : boxes_) {
253 box.scale(im_factor);
254 boxes->push_back(box);
255 }
256 if (boxes->empty()) {
257 // Make a single box for the whole image.
258 TBOX box(0, 0, im_factor * input_width, target_height);
259 boxes->push_back(box);
260 }
261 }
262 if (scale_factor != nullptr) {
263 *scale_factor = im_factor;
264 }
265 return pix;
266 }
267
MemoryUsed() const268 int ImageData::MemoryUsed() const {
269 return image_data_.size();
270 }
271
272 #ifndef GRAPHICS_DISABLED
273
274 // Draws the data in a new window.
Display() const275 void ImageData::Display() const {
276 const int kTextSize = 64;
277 // Draw the image.
278 Image pix = GetPix();
279 if (pix == nullptr) {
280 return;
281 }
282 int width = pixGetWidth(pix);
283 int height = pixGetHeight(pix);
284 auto *win = new ScrollView("Imagedata", 100, 100, 2 * (width + 2 * kTextSize),
285 2 * (height + 4 * kTextSize), width + 10,
286 height + 3 * kTextSize, true);
287 win->Draw(pix, 0, height - 1);
288 pix.destroy();
289 // Draw the boxes.
290 win->Pen(ScrollView::RED);
291 win->Brush(ScrollView::NONE);
292 int text_size = kTextSize;
293 if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) {
294 text_size = boxes_[0].height() * 2;
295 }
296 win->TextAttributes("Arial", text_size, false, false, false);
297 if (!boxes_.empty()) {
298 for (unsigned b = 0; b < boxes_.size(); ++b) {
299 boxes_[b].plot(win);
300 win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
301 }
302 } else {
303 // The full transcription.
304 win->Pen(ScrollView::CYAN);
305 win->Text(0, height + kTextSize * 2, transcription_.c_str());
306 }
307 win->Update();
308 win->Wait();
309 }
310
311 #endif
312
313 // Adds the supplied boxes and transcriptions that correspond to the correct
314 // page number.
AddBoxes(const std::vector<TBOX> & boxes,const std::vector<std::string> & texts,const std::vector<int> & box_pages)315 void ImageData::AddBoxes(const std::vector<TBOX> &boxes,
316 const std::vector<std::string> &texts,
317 const std::vector<int> &box_pages) {
318 // Copy the boxes and make the transcription.
319 for (unsigned i = 0; i < box_pages.size(); ++i) {
320 if (page_number_ >= 0 && box_pages[i] != page_number_) {
321 continue;
322 }
323 transcription_ += texts[i];
324 boxes_.push_back(boxes[i]);
325 box_texts_.push_back(texts[i]);
326 }
327 }
328
329 #ifndef TESSERACT_IMAGEDATA_AS_PIX
330 // Saves the given Pix as a PNG-encoded string and destroys it.
331 // In case of missing PNG support in Leptonica use PNM format,
332 // which requires more memory.
SetPixInternal(Image pix,std::vector<char> * image_data)333 void ImageData::SetPixInternal(Image pix, std::vector<char> *image_data) {
334 l_uint8 *data;
335 size_t size;
336 l_int32 ret;
337 ret = pixWriteMem(&data, &size, pix, IFF_PNG);
338 if (ret) {
339 ret = pixWriteMem(&data, &size, pix, IFF_PNM);
340 }
341 pix.destroy();
342 // TODO: optimize resize (no init).
343 image_data->resize(size);
344 memcpy(&(*image_data)[0], data, size);
345 lept_free(data);
346 }
347
348 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
GetPixInternal(const std::vector<char> & image_data)349 Image ImageData::GetPixInternal(const std::vector<char> &image_data) {
350 Image pix = nullptr;
351 if (!image_data.empty()) {
352 // Convert the array to an image.
353 const auto *u_data =
354 reinterpret_cast<const unsigned char *>(&image_data[0]);
355 pix = pixReadMem(u_data, image_data.size());
356 }
357 return pix;
358 }
359 #endif
360
361 // Parses the text string as a box file and adds any discovered boxes that
362 // match the page number. Returns false on error.
AddBoxes(const char * box_text)363 bool ImageData::AddBoxes(const char *box_text) {
364 if (box_text != nullptr && box_text[0] != '\0') {
365 std::vector<TBOX> boxes;
366 std::vector<std::string> texts;
367 std::vector<int> box_pages;
368 if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
369 /*continue_on_failure*/ true, &boxes, &texts, nullptr,
370 &box_pages)) {
371 AddBoxes(boxes, texts, box_pages);
372 return true;
373 } else {
374 tprintf("Error: No boxes for page %d from image %s!\n", page_number_,
375 imagefilename_.c_str());
376 }
377 }
378 return false;
379 }
380
DocumentData(const std::string & name)381 DocumentData::DocumentData(const std::string &name)
382 : document_name_(name),
383 pages_offset_(-1),
384 total_pages_(-1),
385 memory_used_(0),
386 max_memory_(0),
387 reader_(nullptr) {}
388
~DocumentData()389 DocumentData::~DocumentData() {
390 if (thread.joinable()) {
391 thread.join();
392 }
393 std::lock_guard<std::mutex> lock_p(pages_mutex_);
394 std::lock_guard<std::mutex> lock_g(general_mutex_);
395 for (auto data : pages_) {
396 delete data;
397 }
398 }
399
400 // Reads all the pages in the given lstmf filename to the cache. The reader
401 // is used to read the file.
LoadDocument(const char * filename,int start_page,int64_t max_memory,FileReader reader)402 bool DocumentData::LoadDocument(const char *filename, int start_page,
403 int64_t max_memory, FileReader reader) {
404 SetDocument(filename, max_memory, reader);
405 pages_offset_ = start_page;
406 return ReCachePages();
407 }
408
409 // Sets up the document, without actually loading it.
SetDocument(const char * filename,int64_t max_memory,FileReader reader)410 void DocumentData::SetDocument(const char *filename, int64_t max_memory,
411 FileReader reader) {
412 std::lock_guard<std::mutex> lock_p(pages_mutex_);
413 std::lock_guard<std::mutex> lock(general_mutex_);
414 document_name_ = filename;
415 pages_offset_ = -1;
416 max_memory_ = max_memory;
417 reader_ = reader;
418 }
419
420 // Writes all the pages to the given filename. Returns false on error.
SaveDocument(const char * filename,FileWriter writer)421 bool DocumentData::SaveDocument(const char *filename, FileWriter writer) {
422 std::lock_guard<std::mutex> lock(pages_mutex_);
423 TFile fp;
424 fp.OpenWrite(nullptr);
425 if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) {
426 tprintf("Serialize failed: %s\n", filename);
427 return false;
428 }
429 return true;
430 }
431
432 // Adds the given page data to this document, counting up memory.
AddPageToDocument(ImageData * page)433 void DocumentData::AddPageToDocument(ImageData *page) {
434 std::lock_guard<std::mutex> lock(pages_mutex_);
435 pages_.push_back(page);
436 set_memory_used(memory_used() + page->MemoryUsed());
437 }
438
439 // If the given index is not currently loaded, loads it using a separate
440 // thread.
LoadPageInBackground(int index)441 void DocumentData::LoadPageInBackground(int index) {
442 ImageData *page = nullptr;
443 if (IsPageAvailable(index, &page)) {
444 return;
445 }
446 {
447 std::lock_guard<std::mutex> lock(pages_mutex_);
448 if (pages_offset_ == index) {
449 return;
450 }
451 pages_offset_ = index;
452 for (auto page : pages_) {
453 delete page;
454 }
455 pages_.clear();
456 }
457 if (thread.joinable()) {
458 thread.join();
459 }
460 // Don't run next statement asynchronously because that would
461 // create too many threads on Linux (see issue #3111).
462 ReCachePages();
463 }
464
465 // Returns a pointer to the page with the given index, modulo the total
466 // number of pages. Blocks until the background load is completed.
GetPage(int index)467 const ImageData *DocumentData::GetPage(int index) {
468 ImageData *page = nullptr;
469 while (!IsPageAvailable(index, &page)) {
470 // If there is no background load scheduled, schedule one now.
471 pages_mutex_.lock();
472 bool needs_loading = pages_offset_ != index;
473 pages_mutex_.unlock();
474 if (needs_loading) {
475 LoadPageInBackground(index);
476 }
477 // We can't directly load the page, or the background load will delete it
478 // while the caller is using it, so give it a chance to work.
479 std::this_thread::yield();
480 }
481 return page;
482 }
483
484 // Returns true if the requested page is available, and provides a pointer,
485 // which may be nullptr if the document is empty. May block, even though it
486 // doesn't guarantee to return true.
IsPageAvailable(int index,ImageData ** page)487 bool DocumentData::IsPageAvailable(int index, ImageData **page) {
488 std::lock_guard<std::mutex> lock(pages_mutex_);
489 int num_pages = NumPages();
490 if (num_pages == 0 || index < 0) {
491 *page = nullptr; // Empty Document.
492 return true;
493 }
494 if (num_pages > 0) {
495 index = Modulo(index, num_pages);
496 if (pages_offset_ <= index &&
497 static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {
498 *page = pages_[index - pages_offset_]; // Page is available already.
499 return true;
500 }
501 }
502 return false;
503 }
504
505 // Removes all pages from memory and frees the memory, but does not forget
506 // the document metadata.
UnCache()507 int64_t DocumentData::UnCache() {
508 std::lock_guard<std::mutex> lock(pages_mutex_);
509 int64_t memory_saved = memory_used();
510 for (auto page : pages_) {
511 delete page;
512 }
513 pages_.clear();
514 pages_offset_ = -1;
515 set_total_pages(-1);
516 set_memory_used(0);
517 tprintf("Unloaded document %s, saving %" PRId64 " memory\n",
518 document_name_.c_str(), memory_saved);
519 return memory_saved;
520 }
521
522 // Shuffles all the pages in the document.
Shuffle()523 void DocumentData::Shuffle() {
524 TRand random;
525 // Different documents get shuffled differently, but the same for the same
526 // name.
527 random.set_seed(document_name_.c_str());
528 int num_pages = pages_.size();
529 // Execute one random swap for each page in the document.
530 for (int i = 0; i < num_pages; ++i) {
531 int src = random.IntRand() % num_pages;
532 int dest = random.IntRand() % num_pages;
533 std::swap(pages_[src], pages_[dest]);
534 }
535 }
536
537 // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_
538 // starting at index pages_offset_.
ReCachePages()539 bool DocumentData::ReCachePages() {
540 std::lock_guard<std::mutex> lock(pages_mutex_);
541 // Read the file.
542 set_total_pages(0);
543 set_memory_used(0);
544 int loaded_pages = 0;
545 for (auto page : pages_) {
546 delete page;
547 }
548 pages_.clear();
549 TFile fp;
550 if (!fp.Open(document_name_.c_str(), reader_) ||
551 !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) {
552 tprintf("Deserialize header failed: %s\n", document_name_.c_str());
553 return false;
554 }
555 pages_offset_ %= loaded_pages;
556 // Skip pages before the first one we want, and load the rest until max
557 // memory and skip the rest after that.
558 int page;
559 for (page = 0; page < loaded_pages; ++page) {
560 uint8_t non_null;
561 if (!fp.DeSerialize(&non_null)) {
562 break;
563 }
564 if (page < pages_offset_ ||
565 (max_memory_ > 0 && memory_used() > max_memory_)) {
566 if (non_null && !ImageData::SkipDeSerialize(&fp)) {
567 break;
568 }
569 } else {
570 ImageData *image_data = nullptr;
571 if (non_null) {
572 image_data = new ImageData;
573 if (!image_data->DeSerialize(&fp)) {
574 delete image_data;
575 break;
576 }
577 }
578 pages_.push_back(image_data);
579 if (image_data->imagefilename().empty()) {
580 image_data->set_imagefilename(document_name_);
581 image_data->set_page_number(page);
582 }
583 set_memory_used(memory_used() + image_data->MemoryUsed());
584 }
585 }
586 if (page < loaded_pages) {
587 tprintf("Deserialize failed: %s read %d/%d lines\n", document_name_.c_str(),
588 page, loaded_pages);
589 for (auto page : pages_) {
590 delete page;
591 }
592 pages_.clear();
593 } else if (loaded_pages > 1) {
594 // Avoid lots of messages for training with single line images.
595 tprintf("Loaded %zu/%d lines (%d-%zu) of document %s\n", pages_.size(),
596 loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),
597 document_name_.c_str());
598 }
599 set_total_pages(loaded_pages);
600 return !pages_.empty();
601 }
602
603 // A collection of DocumentData that knows roughly how much memory it is using.
DocumentCache(int64_t max_memory)604 DocumentCache::DocumentCache(int64_t max_memory) : max_memory_(max_memory) {}
605
~DocumentCache()606 DocumentCache::~DocumentCache() {
607 for (auto *document : documents_) {
608 delete document;
609 }
610 }
611
612 // Adds all the documents in the list of filenames, counting memory.
613 // The reader is used to read the files.
LoadDocuments(const std::vector<std::string> & filenames,CachingStrategy cache_strategy,FileReader reader)614 bool DocumentCache::LoadDocuments(const std::vector<std::string> &filenames,
615 CachingStrategy cache_strategy,
616 FileReader reader) {
617 cache_strategy_ = cache_strategy;
618 int64_t fair_share_memory = 0;
619 // In the round-robin case, each DocumentData handles restricting its content
620 // to its fair share of memory. In the sequential case, DocumentCache
621 // determines which DocumentDatas are held entirely in memory.
622 if (cache_strategy_ == CS_ROUND_ROBIN) {
623 fair_share_memory = max_memory_ / filenames.size();
624 }
625 for (const auto &filename : filenames) {
626 auto *document = new DocumentData(filename);
627 document->SetDocument(filename.c_str(), fair_share_memory, reader);
628 AddToCache(document);
629 }
630 if (!documents_.empty()) {
631 // Try to get the first page now to verify the list of filenames.
632 if (GetPageBySerial(0) != nullptr) {
633 return true;
634 }
635 tprintf("Load of page 0 failed!\n");
636 }
637 return false;
638 }
639
640 // Adds document to the cache.
AddToCache(DocumentData * data)641 bool DocumentCache::AddToCache(DocumentData *data) {
642 documents_.push_back(data);
643 return true;
644 }
645
646 // Finds and returns a document by name.
FindDocument(const std::string & document_name) const647 DocumentData *DocumentCache::FindDocument(
648 const std::string &document_name) const {
649 for (auto *document : documents_) {
650 if (document->document_name() == document_name) {
651 return document;
652 }
653 }
654 return nullptr;
655 }
656
657 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
658 // strategy, could take a long time.
TotalPages()659 int DocumentCache::TotalPages() {
660 if (cache_strategy_ == CS_SEQUENTIAL) {
661 // In sequential mode, we assume each doc has the same number of pages
662 // whether it is true or not.
663 if (num_pages_per_doc_ == 0) {
664 GetPageSequential(0);
665 }
666 return num_pages_per_doc_ * documents_.size();
667 }
668 int total_pages = 0;
669 for (auto *document : documents_) {
670 // We have to load a page to make NumPages() valid.
671 document->GetPage(0);
672 total_pages += document->NumPages();
673 }
674 return total_pages;
675 }
676
677 // Returns a page by serial number, selecting them in a round-robin fashion
678 // from all the documents. Highly disk-intensive, but doesn't need samples
679 // to be shuffled between files to begin with.
GetPageRoundRobin(int serial)680 const ImageData *DocumentCache::GetPageRoundRobin(int serial) {
681 int num_docs = documents_.size();
682 int doc_index = serial % num_docs;
683 const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs);
684 for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
685 doc_index = (serial + offset) % num_docs;
686 int page = (serial + offset) / num_docs;
687 documents_[doc_index]->LoadPageInBackground(page);
688 }
689 return doc;
690 }
691
692 // Returns a page by serial number, selecting them in sequence from each file.
693 // Requires the samples to be shuffled between the files to give a random or
694 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
GetPageSequential(int serial)695 const ImageData *DocumentCache::GetPageSequential(int serial) {
696 int num_docs = documents_.size();
697 ASSERT_HOST(num_docs > 0);
698 if (num_pages_per_doc_ == 0) {
699 // Use the pages in the first doc as the number of pages in each doc.
700 documents_[0]->GetPage(0);
701 num_pages_per_doc_ = documents_[0]->NumPages();
702 if (num_pages_per_doc_ == 0) {
703 tprintf("First document cannot be empty!!\n");
704 ASSERT_HOST(num_pages_per_doc_ > 0);
705 }
706 // Get rid of zero now if we don't need it.
707 if (serial / num_pages_per_doc_ % num_docs > 0) {
708 documents_[0]->UnCache();
709 }
710 }
711 int doc_index = serial / num_pages_per_doc_ % num_docs;
712 const ImageData *doc =
713 documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
714 // Count up total memory. Background loading makes it more complicated to
715 // keep a running count.
716 int64_t total_memory = 0;
717 for (auto *document : documents_) {
718 total_memory += document->memory_used();
719 }
720 if (total_memory >= max_memory_) {
721 // Find something to un-cache.
722 // If there are more than 3 in front, then serial is from the back reader
723 // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
724 // we create a hole between them and then un-caching the backmost occupied
725 // will work for both.
726 int num_in_front = CountNeighbourDocs(doc_index, 1);
727 for (int offset = num_in_front - 2;
728 offset > 1 && total_memory >= max_memory_; --offset) {
729 int next_index = (doc_index + offset) % num_docs;
730 total_memory -= documents_[next_index]->UnCache();
731 }
732 // If that didn't work, the best solution is to un-cache from the back. If
733 // we take away the document that a 2nd reader is using, it will put it
734 // back and make a hole between.
735 int num_behind = CountNeighbourDocs(doc_index, -1);
736 for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
737 ++offset) {
738 int next_index = (doc_index + offset + num_docs) % num_docs;
739 total_memory -= documents_[next_index]->UnCache();
740 }
741 }
742 int next_index = (doc_index + 1) % num_docs;
743 if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
744 documents_[next_index]->LoadPageInBackground(0);
745 }
746 return doc;
747 }
748
749 // Helper counts the number of adjacent cached neighbours of index looking in
750 // direction dir, ie index+dir, index+2*dir etc.
CountNeighbourDocs(int index,int dir)751 int DocumentCache::CountNeighbourDocs(int index, int dir) {
752 int num_docs = documents_.size();
753 for (int offset = dir; abs(offset) < num_docs; offset += dir) {
754 int offset_index = (index + offset + num_docs) % num_docs;
755 if (!documents_[offset_index]->IsCached()) {
756 return offset - dir;
757 }
758 }
759 return num_docs;
760 }
761
762 } // namespace tesseract.
763