1 /******************************************************************************
2  *
3  * File:         context.cpp  (Formerly context.c)
4  * Description:  Context checking functions
5  * Author:       Mark Seaman, OCR Technology
6  *
7  * (c) Copyright 1990, Hewlett-Packard Company.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  *****************************************************************************/
19 
20 #include "dict.h"
21 #include "unicharset.h"
22 
23 namespace tesseract {
24 
// Thresholds used by Dict::absolute_garbage().
// Words with fewer unichars than this are never reported as absolute garbage.
static constexpr int kMinAbsoluteGarbageWordLength = 10;
// A long-enough word is reported as absolute garbage when the fraction of its
// unichars that are alphabetic or digits is strictly below this value.
static constexpr float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
27 
// State-transition table consulted by Dict::case_ok().
//
// Row    = current state (see the per-row comments below).
// Column = class of the next character:
//   0: P - anything that is not upper, lower or digit (presumably punctuation)
//   1: U - upper case
//   2: L - lower case
//   3: D - digit
// Entry  = next state, or -1 to signal an error on case.
const int case_state_table[6][4] = {
    //  P   U   L   D
    {0, 1, 5, 4},   // 0. Beginning of word
    {0, 3, 2, 4},   // 1. After initial capital
    {0, -1, 2, -1}, // 2. After lower case
    {0, 3, -1, 4},  // 3. After upper case
    {0, -1, -1, 4}, // 4. After a digit
    {5, -1, 2, -1}, // 5. After initial lower case
};
44 
case_ok(const WERD_CHOICE & word) const45 int Dict::case_ok(const WERD_CHOICE &word) const {
46   int state = 0;
47   const UNICHARSET *unicharset = word.unicharset();
48   for (unsigned x = 0; x < word.length(); ++x) {
49     UNICHAR_ID ch_id = word.unichar_id(x);
50     if (unicharset->get_isupper(ch_id)) {
51       state = case_state_table[state][1];
52     } else if (unicharset->get_islower(ch_id)) {
53       state = case_state_table[state][2];
54     } else if (unicharset->get_isdigit(ch_id)) {
55       state = case_state_table[state][3];
56     } else {
57       state = case_state_table[state][0];
58     }
59     if (state == -1) {
60       return false;
61     }
62   }
63   return state != 5; // single lower is bad
64 }
65 
absolute_garbage(const WERD_CHOICE & word,const UNICHARSET & unicharset)66 bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
67   if (word.length() < kMinAbsoluteGarbageWordLength) {
68     return false;
69   }
70   int num_alphanum = 0;
71   for (unsigned x = 0; x < word.length(); ++x) {
72     num_alphanum +=
73         (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
74   }
75   return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <
76           kMinAbsoluteGarbageAlphanumFrac);
77 }
78 
79 } // namespace tesseract
80