1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18
19
20 #ifndef I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
21 #define I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
22
23 #include "integral_types.h"
24 #include "langspan.h"
25 #include "offsetmap.h"
26
27 namespace CLD2 {
28
// Script buffer size limit, in bytes (40 KiB).
static const int kMaxScriptBuffer = 40 * 1024;

// One and a half times kMaxScriptBuffer.
// NOTE(review): presumably the size of the lowercased-text buffer -- confirm
// against the implementation.
static const int kMaxScriptLowerBuffer =
    kMaxScriptBuffer + (kMaxScriptBuffer / 2);

// Stop at a word space once within the last N bytes of the script buffer.
static const int kWithinScriptTail = 32;

// Usable byte count: the buffer size minus 32 bytes, to leave some room.
static const int kMaxScriptBytes = kMaxScriptBuffer - 32;
34
35
// Returns true when c is a UTF-8 continuation (trailing) byte, i.e. its top
// two bits are 10, which is the byte range 0x80..0xBF.
static inline bool IsContinuationByte(char c) {
  const unsigned char byte = static_cast<unsigned char>(c);
  return (byte & 0xC0) == 0x80;
}
39
// Gets the lscript number for the character at src when it is a letter;
// always returns 0 (common script) for non-letters.
// NOTE(review): presumably src must point at the first byte of a complete
// UTF-8 character -- confirm against the implementation.
int GetUTF8LetterScriptNum(const char* src);
43
// Returns src advanced to the start of the next quadgram, which is 2 to 5
// bytes past src (src itself is passed by value, so the caller must use the
// returned pointer).
// Looks at src[0..4].
// NOTE(review): presumably those five bytes must therefore be readable in
// the caller's buffer -- confirm against the implementation.
const char* AdvanceQuad(const char* src);
47
48
49 class ScriptScanner {
50 public:
51 ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
52 ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text,
53 bool any_text, bool any_script);
54 ~ScriptScanner();
55
56 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
57 bool GetOneScriptSpan(LangSpan* span);
58
59 // Force Latin and Cyrillic scripts to be lowercase
60 void LowerScriptSpan(LangSpan* span);
61
62 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
63 // Force Latin and Cyrillic scripts to be lowercase
64 bool GetOneScriptSpanLower(LangSpan* span);
65
66 // Copy next run of non-tag characters to buffer [NUL terminated]
67 // This just removes tags and removes entities
68 // Buffer has leading space
69 bool GetOneTextSpan(LangSpan* span);
70
71 // Maps byte offset in most recent GetOneScriptSpan/Lower
72 // span->text [0..text_bytes] into an additional byte offset from
73 // span->offset, to get back to corresponding text in the original
74 // input buffer.
75 // text_offset must be the first byte
76 // of a UTF-8 character, or just beyond the last character. Normally this
77 // routine is called with the first byte of an interesting range and
78 // again with the first byte of the following range.
79 int MapBack(int text_offset);
80
GetBufferStart()81 const char* GetBufferStart() {return start_byte_;};
82
83 private:
84 // Skip over tags and non-letters
85 int SkipToFrontOfSpan(const char* src, int len, int* script);
86
87 const char* start_byte_; // Starting byte of buffer to scan
88 const char* next_byte_; // First unscanned byte
89 const char* next_byte_limit_; // Last byte + 1
90 int byte_length_; // Bytes left: next_byte_limit_ - next_byte_
91
92 bool is_plain_text_; // true fo text, false for HTML
93 char* script_buffer_; // Holds text with expanded entities
94 char* script_buffer_lower_; // Holds lowercased text
95 bool letters_marks_only_; // To distinguish scriptspan of one
96 // letters/marks vs. any mixture of text
97 bool one_script_only_; // To distinguish scriptspan of one
98 // script vs. any mixture of scripts
99 int exit_state_; // For tag parser kTagParseTbl_0, based
100 // on letters_marks_only_
101 public :
102 // Expose for debugging
103 OffsetMap map2original_; // map from script_buffer_ to buffer
104 OffsetMap map2uplow_; // map from script_buffer_lower_ to script_buffer_
105 };
106
107 } // namespace CLD2
108
109 #endif // I18N_ENCODINGS_CLD2_INTERNAL_GETONESCRIPTSPAN_H_
110
111