1 // Copyright 2013 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // 16 // Author: dsites@google.com (Dick Sites) 17 // 18 // 19 // Terminology: 20 // Incoming original text has HTML tags and entities removed, all but letters 21 // removed, and letters lowercased. Strings of non-letters are mapped to a 22 // single ASCII space. 23 // 24 // One scriptspan has a run of letters/spaces in a single script. This is the 25 // fundamental text unit that is scored. There is an optional backmap from 26 // scriptspan text to the original document text, so that the language ranges 27 // reported in ResultChunkVector refer to byte ranges inthe original text. 28 // 29 // Scripts come in two forms, the full Unicode scripts described by 30 // http://www.unicode.org/Public/UNIDATA/Scripts.txt 31 // and a modified list used exclusively in CLD2. The modified form maps all 32 // the CJK scripts to one, Hani. The current version description is in 33 // i18n/encodings/cld2/builddata/script_summary.txt 34 // In addition, all non-letters are mapped to the Common script. 35 // 36 // ULScript describes this Unicode Letter script. 37 // 38 // Scoring uses text nil-grams, uni-grams, bi-grams, quad-grams, and octa-grams. 39 // Nilgrams (no text lookup at all) are for script-based pseudo-languages and 40 // for languages that are 1:1 with a given script. Unigrams and bigrams are 41 // used to score the CJK languages, all in the Hani script. Quadgrams and 42 // octagrams are used to score all other languages. 43 // 44 // RType is the Recognition Type per ulscript. 45 // 46 // The scoring tables map various grams to language-probability scores. 47 // A given gram that hits in scoring table maps to an indirect subscript into 48 // a list of packed languages and log probabilities. 49 // 50 // Languages are stored in two forms: 10-bit values in the Languge enum, and 51 // shorter 8-bit per-ulscript values in the scoring tables. 52 // 53 // Language refers to the full 10-bit range. 54 // pslang refers to the per-ulscript shorter values. 55 // 56 // Log probabilities also come in two forms. The full range uses values 0..255 57 // to represent minus log base 10th-root-of-2, covering 1 .. 1/2**25.5 or about 58 // TODO BOGUS description, 24 vs 12 59 // 1/47.5M. The second form quantizes these into multiples of 8 that can be 60 // added together to represent probability products. The quantized form uses 61 // values 24..0 with 0 now least likely instead of most likely, thus making 62 // larger sums for more probable results. 24 maps to original 1/2**4.8 (~1/28) 63 // and 0 maps to original 1/2**24.0 (~1/16M). 64 // 65 // qprob refers to quantized log probabilities. 66 // 67 // langprob is a uint32 holding three 1-byte pslangs and a 1-byte subscript to 68 // a list of three qprobs. It always nees a companion ulscript 69 // 70 // A scriptspan is scored via one or more hitbuffers 71 72 73 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ 74 #define I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ 75 76 #include <stdio.h> 77 78 #include "integral_types.h" // for uint8 etc. 79 80 #include "cld2tablesummary.h" 81 #include "compact_lang_det_impl.h" // for ResultChunkVector 82 #include "getonescriptspan.h" 83 #include "langspan.h" 84 #include "tote.h" 85 #include "utf8statetable.h" 86 87 namespace CLD2 { 88 89 static const int kMaxBoosts = 4; // For each of PerScriptLangBoosts 90 // must be power of two for wrap() 91 static const int kChunksizeQuads = 20; // For non-CJK 92 static const int kChunksizeUnis = 50; // For CJK 93 static const int kMaxScoringHits = 1000; 94 static const int kMaxSummaries = kMaxScoringHits / kChunksizeQuads; 95 96 97 // The first four tables are for CJK languages, 98 // the next three for quadgram languages, and 99 // the last for expected scores. 100 typedef struct { 101 const UTF8PropObj* unigram_obj; // 80K CJK characters 102 const CLD2TableSummary* unigram_compat_obj; // 256 CJK lookup probabilities 103 const CLD2TableSummary* deltabi_obj; 104 const CLD2TableSummary* distinctbi_obj; 105 106 const CLD2TableSummary* quadgram_obj; // Primary quadgram lookup table 107 const CLD2TableSummary* quadgram_obj2; // Secondary " 108 const CLD2TableSummary* deltaocta_obj; 109 const CLD2TableSummary* distinctocta_obj; 110 111 const short* kExpectedScore; // Expected base + delta + distinct score 112 // per 1KB input 113 // Subscripted by language and script4 114 } ScoringTables; 115 116 // Context for boosting several languages 117 typedef struct { 118 int32 n; 119 uint32 langprob[kMaxBoosts]; wrap__anoncda60c400208120 int wrap(int32 n) {return n & (kMaxBoosts - 1);} 121 } LangBoosts; 122 123 typedef struct { 124 LangBoosts latn; 125 LangBoosts othr; 126 } PerScriptLangBoosts; 127 128 129 130 // ScoringContext carries state across scriptspans 131 // ScoringContext also has read-only scoring tables mapping grams to qprobs 132 typedef struct { 133 FILE* debug_file; // Non-NULL if debug output wanted 134 bool flags_cld2_score_as_quads; 135 bool flags_cld2_html; 136 bool flags_cld2_cr; 137 bool flags_cld2_verbose; 138 ULScript ulscript; // langprobs below are with respect to this script 139 Language prior_chunk_lang; // Mostly for debug output 140 // boost has a packed set of per-script langs and probabilites 141 // whack has a per-script lang to be suppressed from ever scoring (zeroed) 142 // When a language in a close set is given as an explicit hint, others in 143 // that set will be whacked. 144 PerScriptLangBoosts langprior_boost; // From http content-lang or meta lang= 145 PerScriptLangBoosts langprior_whack; // From http content-lang or meta lang= 146 PerScriptLangBoosts distinct_boost; // From distinctive letter groups 147 int oldest_distinct_boost; // Subscript in hitbuffer of oldest 148 // distinct score to use 149 const ScoringTables* scoringtables; // Probability lookup tables 150 ScriptScanner* scanner; // For ResultChunkVector backmap 151 152 // Inits boosts init__anoncda60c400408153 void init() { 154 memset(&langprior_boost, 0, sizeof(langprior_boost)); 155 memset(&langprior_whack, 0, sizeof(langprior_whack)); 156 memset(&distinct_boost, 0, sizeof(distinct_boost)); 157 }; 158 } ScoringContext; 159 160 161 162 // Begin private 163 164 // Holds one scoring-table lookup hit. We hold indirect subscript instead of 165 // langprob to allow a single hit to use a variable number of langprobs. 166 typedef struct { 167 int offset; // First byte of quad/octa etc. in scriptspan 168 int indirect; // subscript of langprobs in scoring table 169 } ScoringHit; 170 171 typedef enum { 172 UNIHIT = 0, 173 QUADHIT = 1, 174 DELTAHIT = 2, 175 DISTINCTHIT = 3 176 } LinearHitType; 177 178 // Holds one scoring-table lookup hit resolved into a langprob. 179 typedef struct { 180 uint16 offset; // First byte of quad/octa etc. in scriptspan 181 uint16 type; // LinearHitType 182 uint32 langprob; // langprob from scoring table 183 } LangprobHit; 184 185 // Holds arrays of scoring-table lookup hits for (part of) a scriptspan 186 typedef struct { 187 ULScript ulscript; // langprobs below are with respect to this script 188 int maxscoringhits; // determines size of arrays below 189 int next_base; // First unused entry in each array 190 int next_delta; // " 191 int next_distinct; // " 192 int next_linear; // " 193 int next_chunk_start; // First unused chunk_start entry 194 int lowest_offset; // First byte of text span used to fill hitbuffer 195 // Dummy entry at the end of each giving offset of first unused text byte 196 ScoringHit base[kMaxScoringHits + 1]; // Uni/quad hits 197 ScoringHit delta[kMaxScoringHits + 1]; // delta-bi/delta-octa hits 198 ScoringHit distinct[kMaxScoringHits + 1]; // distinct-word hits 199 LangprobHit linear[4 * kMaxScoringHits + 1]; // Above three merge-sorted 200 // (4: some bases => 2 linear) 201 int chunk_start[kMaxSummaries + 1]; // First linear[] subscr of 202 // each scored chunk 203 int chunk_offset[kMaxSummaries + 1]; // First text subscr of 204 // each scored chunk 205 init__anoncda60c400808206 void init() { 207 ulscript = ULScript_Common; 208 maxscoringhits = kMaxScoringHits; 209 next_base = 0; 210 next_delta = 0; 211 next_distinct = 0; 212 next_linear = 0; 213 next_chunk_start = 0; 214 lowest_offset = 0; 215 base[0].offset = 0; 216 base[0].indirect = 0; 217 delta[0].offset = 0; 218 delta[0].indirect = 0; 219 distinct[0].offset = 0; 220 distinct[0].indirect = 0; 221 linear[0].offset = 0; 222 linear[0].langprob = 0; 223 chunk_start[0] = 0; 224 chunk_offset[0] = 0; 225 }; 226 } ScoringHitBuffer; 227 228 // TODO: Explain here why we need both ChunkSpan and ChunkSummary 229 typedef struct { 230 int chunk_base; // Subscript of first hitbuffer.base[] in chunk 231 int chunk_delta; // Subscript of first hitbuffer.delta[] 232 int chunk_distinct; // Subscript of first hitbuffer.distinct[] 233 int base_len; // Number of hitbuffer.base[] in chunk 234 int delta_len; // Number of hitbuffer.delta[] in chunk 235 int distinct_len; // Number of hitbuffer.distinct[] in chunk 236 } ChunkSpan; 237 238 239 // Packed into 20 bytes for space 240 typedef struct { 241 uint16 offset; // Text offset within current scriptspan.text 242 uint16 chunk_start; // Scoring subscr within hitbuffer->linear[] 243 uint16 lang1; // Top lang, mapped to full Language 244 uint16 lang2; // Second lang, mapped to full Language 245 uint16 score1; // Top lang raw score 246 uint16 score2; // Second lang raw score 247 uint16 bytes; // Number of lower letters bytes in chunk 248 uint16 grams; // Number of scored base quad- uni-grams in chunk 249 uint16 ulscript; // ULScript of chunk 250 uint8 reliability_delta; // Reliability 0..100, delta top:second scores 251 uint8 reliability_score; // Reliability 0..100, top:expected score 252 } ChunkSummary; 253 254 255 // We buffer up ~50 chunk summaries, corresponding to chunks of 20 quads in a 256 // 1000-quad hit buffer, so we can do boundary adjustment on them 257 // when adjacent entries are different languages. After that, we add them 258 // all into the document score 259 // 260 // About 50 * 20 = 1000 bytes. OK for stack alloc 261 typedef struct { 262 int n; 263 ChunkSummary chunksummary[kMaxSummaries + 1]; 264 } SummaryBuffer; 265 266 // End private 267 268 269 // Score RTypeNone or RTypeOne scriptspan into doc_tote and vec, updating 270 // scoringcontext 271 void ScoreEntireScriptSpan(const LangSpan& scriptspan, 272 ScoringContext* scoringcontext, 273 DocTote* doc_tote, 274 ResultChunkVector* vec); 275 276 // Score RTypeCJK scriptspan into doc_tote and vec, updating scoringcontext 277 void ScoreCJKScriptSpan(const LangSpan& scriptspan, 278 ScoringContext* scoringcontext, 279 DocTote* doc_tote, 280 ResultChunkVector* vec); 281 282 // Score RTypeMany scriptspan into doc_tote and vec, updating scoringcontext 283 void ScoreQuadScriptSpan(const LangSpan& scriptspan, 284 ScoringContext* scoringcontext, 285 DocTote* doc_tote, 286 ResultChunkVector* vec); 287 288 // Score one scriptspan into doc_tote and vec, updating scoringcontext 289 void ScoreOneScriptSpan(const LangSpan& scriptspan, 290 ScoringContext* scoringcontext, 291 DocTote* doc_tote, 292 ResultChunkVector* vec); 293 294 } // End namespace CLD2 295 296 #endif // I18N_ENCODINGS_CLD2_INTERNAL_SCOREONESCRIPTSPAN_H__ 297 298