1 /******************************************************************************
2 *
3 * File: context.cpp (Formerly context.c)
4 * Description: Context checking functions
5 * Author: Mark Seaman, OCR Technology
6 *
7 * (c) Copyright 1990, Hewlett-Packard Company.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 *****************************************************************************/
19
20 #include "dict.h"
21 #include "unicharset.h"
22
23 namespace tesseract {
24
// Thresholds used by Dict::absolute_garbage().
// Words with fewer unichars than this are never reported as absolute garbage.
static constexpr int kMinAbsoluteGarbageWordLength = 10;
// A sufficiently long word is reported as absolute garbage when the fraction
// of its unichars that are alphabetic or digits is below this value.
static constexpr float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
27
// State-transition table consulted by Dict::case_ok().
//
// Row    = current state (0..5, described per row below).
// Column = class of the next character, as selected in Dict::case_ok():
//   [0] P: none of upper/lower/digit (presumably punctuation)
//   [1] U: upper case
//   [2] L: lower case
//   [3] D: digit
// An entry of -1 means "error on case".
const int case_state_table[6][4] = {
    //  P   U   L   D
    {0, 1, 5, 4},    // 0. Beginning of word
    {0, 3, 2, 4},    // 1. After initial capital
    {0, -1, 2, -1},  // 2. After lower case
    {0, 3, -1, 4},   // 3. After upper case
    {0, -1, -1, 4},  // 4. After a digit
    {5, -1, 2, -1},  // 5. After initial lower case
};
44
case_ok(const WERD_CHOICE & word) const45 int Dict::case_ok(const WERD_CHOICE &word) const {
46 int state = 0;
47 const UNICHARSET *unicharset = word.unicharset();
48 for (unsigned x = 0; x < word.length(); ++x) {
49 UNICHAR_ID ch_id = word.unichar_id(x);
50 if (unicharset->get_isupper(ch_id)) {
51 state = case_state_table[state][1];
52 } else if (unicharset->get_islower(ch_id)) {
53 state = case_state_table[state][2];
54 } else if (unicharset->get_isdigit(ch_id)) {
55 state = case_state_table[state][3];
56 } else {
57 state = case_state_table[state][0];
58 }
59 if (state == -1) {
60 return false;
61 }
62 }
63 return state != 5; // single lower is bad
64 }
65
absolute_garbage(const WERD_CHOICE & word,const UNICHARSET & unicharset)66 bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
67 if (word.length() < kMinAbsoluteGarbageWordLength) {
68 return false;
69 }
70 int num_alphanum = 0;
71 for (unsigned x = 0; x < word.length(); ++x) {
72 num_alphanum +=
73 (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
74 }
75 return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <
76 kMinAbsoluteGarbageAlphanumFrac);
77 }
78
79 } // namespace tesseract
80