1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 //
16 // Author: dsites@google.com (Dick Sites)
17 //
18 
19 
20 #include "getonescriptspan.h"
21 #include <string.h>
22 
23 #include "fixunicodevalue.h"
24 #include "lang_script.h"
25 #include "port.h"
26 #include "utf8statetable.h"
27 
28 #include "utf8prop_lettermarkscriptnum.h"
29 #include "utf8repl_lettermarklower.h"
30 #include "utf8scannot_lettermarkspecial.h"
31 
32 
33 namespace CLD2 {
34 
35 // Alphabetical order for binary search, from
36 // generated_entities.cc
37 extern const int kNameToEntitySize;
38 extern const CharIntPair kNameToEntity[];
39 
// Word-boundary rounding parameters.
//   kMaxUpToWordBoundary:      a span shorter than this is made longer,
//                              otherwise it is made shorter.
//   kMaxAdvanceToWordBoundary: move at most +/- this many bytes when rounding
//                              to a word boundary (direction chosen as above).
static const int kMaxUpToWordBoundary = 50;
static const int kMaxAdvanceToWordBoundary = 10;
45 
// Per-byte flag table: nonzero only for the three HTML-significant bytes
// '&' (0x26), '<' (0x3C) and '>' (0x3E). One row per 16 byte values.
static const char kSpecialSymbol[256] = {
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x00
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x10
  0,0,0,0,0,0,1,0, 0,0,0,0,0,0,0,0,   // 0x20   &
  0,0,0,0,0,0,0,0, 0,0,0,0,1,0,1,0,   // 0x30   < >
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x40
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x50
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x60
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x70

  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x80
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0x90
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xA0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xB0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xC0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xD0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xE0
  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,   // 0xF0
};
57 
58 
59 
60 #define LT 0      // <
61 #define GT 1      // >
62 #define EX 2      // !
63 #define HY 3      // -
64 #define QU 4      // "
65 #define AP 5      // '
66 #define SL 6      // /
67 #define S_ 7
68 #define C_ 8
69 #define R_ 9
70 #define I_ 10
71 #define P_ 11
72 #define T_ 12
73 #define Y_ 13
74 #define L_ 14
75 #define E_ 15
76 #define CR 16     // <cr> or <lf>
77 #define NL 17     // non-letter: ASCII whitespace, digit, punctuation
78 #define PL 18     // possible letter, incl. &
79 #define xx 19     // <unused>
80 
81 // Map byte to one of ~20 interesting categories for cheap tag parsing
82 static const uint8 kCharToSub[256] = {
83   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,CR,NL, NL,CR,NL,NL,
84   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
85   NL,EX,QU,NL, NL,NL,PL,AP, NL,NL,NL,NL, NL,HY,NL,SL,
86   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, LT,NL,GT,NL,
87 
88   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
89   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
90   PL,PL,PL,C_, PL,E_,PL,PL, PL,I_,PL,PL, L_,PL,PL,PL,
91   P_,PL,R_,S_, T_,PL,PL,PL, PL,Y_,PL,NL, NL,NL,NL,NL,
92 
93   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
94   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
95   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
96   NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL, NL,NL,NL,NL,
97 
98   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
99   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
100   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
101   PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL, PL,PL,PL,PL,
102 };
103 
104 #undef LT
105 #undef GT
106 #undef EX
107 #undef HY
108 #undef QU
109 #undef AP
110 #undef SL
111 #undef S_
112 #undef C_
113 #undef R_
114 #undef I_
115 #undef P_
116 #undef T_
117 #undef Y_
118 #undef L_
119 #undef E_
120 #undef CR
121 #undef NL
122 #undef PL
123 #undef xx
124 
125 
126 #define OK 0
127 #define X_ 1
128 
129 
130 static const int kMaxExitStateLettersMarksOnly = 1;
131 static const int kMaxExitStateAllText = 2;
132 
133 
134 // State machine to do cheap parse of non-letter strings incl. tags
135 // advances <tag>
136 //          |    |
137 // advances <tag> ... </tag>  for <script> <style>
138 //          |               |
139 // advances <!-- ... <tag> ... -->
140 //          |                     |
141 // advances <tag
142 //          ||  (0)
143 // advances <tag <tag2>
144 //          ||  (0)
145 //
146 // We start in state [0] at a non-letter and make at least one transition
147 // When scanning for just letters, arriving back at state [0] or [1] exits
148 //   the state machine.
149 // When scanning for any non-tag text, arriving at state [2] also exits
150 static const uint8 kTagParseTbl_0[] = {
151 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
152    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [0] OK    exit state
153   X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, X_,X_,X_,X_, // [1] error exit state
154    3, 2, 2, 2,  2, 2, 2,OK, OK,OK,OK,OK, OK,OK,OK,OK,  2, 2,OK,X_, // [2] NL*   [exit state]
155   X_, 2, 4, 9, 10,11, 9,13,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [3] <
156   X_, 2, 9, 5, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [4] <!
157   X_, 2, 9, 6, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [5] <!-
158    6, 6, 6, 7,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [6] <!--.*
159    6, 6, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [7] <!--.*-
160    6, 2, 6, 8,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6, 6,  6, 6, 6,X_, // [8] <!--.*--
161   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [9] <.*
162   10,10,10,10,  9,10,10,10, 10,10,10,10, 10,10,10,10, 12,10,10,X_, // [10] <.*"
163   11,11,11,11, 11, 9,11,11, 11,11,11,11, 11,11,11,11, 12,11,11,X_, // [11] <.*'
164   X_, 2,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,12, 12,12,12,X_, // [12] <.* no " '
165 
166 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
167   X_, 2, 9, 9, 10,11, 9, 9, 14, 9, 9, 9, 28, 9, 9, 9,  9, 9, 9,X_, // [13] <S
168   X_, 2, 9, 9, 10,11, 9, 9,  9,15, 9, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [14] <SC
169   X_, 2, 9, 9, 10,11, 9, 9,  9, 9,16, 9,  9, 9, 9, 9,  9, 9, 9,X_, // [15] <SCR
170   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9,17,  9, 9, 9, 9,  9, 9, 9,X_, // [16] <SCRI
171   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9, 18, 9, 9, 9,  9, 9, 9,X_, // [17] <SCRIP
172   X_,19, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 19,19, 9,X_, // [18] <SCRIPT
173   20,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [19] <SCRIPT .*
174   19,19,19,19, 19,19,21,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [20] <SCRIPT .*<
175   19,19,19,19, 19,19,19,22, 19,19,19,19, 19,19,19,19, 21,21,19,X_, // [21] <SCRIPT .*</ allow SP CR LF
176   19,19,19,19, 19,19,19,19, 23,19,19,19, 19,19,19,19, 19,19,19,X_, // [22] <SCRIPT .*</S
177   19,19,19,19, 19,19,19,19, 19,24,19,19, 19,19,19,19, 19,19,19,X_, // [23] <SCRIPT .*</SC
178   19,19,19,19, 19,19,19,19, 19,19,25,19, 19,19,19,19, 19,19,19,X_, // [24] <SCRIPT .*</SCR
179   19,19,19,19, 19,19,19,19, 19,19,19,26, 19,19,19,19, 19,19,19,X_, // [25] <SCRIPT .*</SCRI
180   19,19,19,19, 19,19,19,19, 19,19,19,19, 27,19,19,19, 19,19,19,X_, // [26] <SCRIPT .*</SCRIP
181   19, 2,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,19, 19,19,19,X_, // [27] <SCRIPT .*</SCRIPT
182 
183 // <  >  !  -   "  '  /  S   C  R  I  P   T  Y  L  E  CR NL PL xx
184   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9,29, 9, 9,  9, 9, 9,X_, // [28] <ST
185   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9,30, 9,  9, 9, 9,X_, // [29] <STY
186   X_, 2, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9,31,  9, 9, 9,X_, // [30] <STYL
187   X_,32, 9, 9, 10,11, 9, 9,  9, 9, 9, 9,  9, 9, 9, 9, 32,32, 9,X_, // [31] <STYLE
188   33,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [32] <STYLE .*
189   32,32,32,32, 32,32,34,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [33] <STYLE .*<
190   32,32,32,32, 32,32,32,35, 32,32,32,32, 32,32,32,32, 34,34,32,X_, // [34] <STYLE .*</ allow SP CR LF
191   32,32,32,32, 32,32,32,32, 32,32,32,32, 36,32,32,32, 32,32,32,X_, // [35] <STYLE .*</S
192   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,37,32,32, 32,32,32,X_, // [36] <STYLE .*</ST
193   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,38,32, 32,32,32,X_, // [37] <STYLE .*</STY
194   32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,39, 32,32,32,X_, // [38] <STYLE .*</STYL
195   32, 2,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,32, 32,32,32,X_, // [39] <STYLE .*</STYLE
196 };
197 
198 #undef OK
199 #undef X_
200 
// Constants for UTF-8 <-> rune conversion.
enum
{
  UTFmax    = 4,          // most bytes ever needed to encode one rune
  Runesync  = 0x80,       // bytes below this cannot be part of a UTF sequence
  Runeself  = 0x80,       // values below this encode as themselves
  Runeerror = 0xFFFD,     // rune substituted for a decoding/range error
  Runemax   = 0x10FFFF,   // largest valid rune value
};
209 
210 // Debugging. Not thread safe.
211 static char gDisplayPiece[32];
212 const uint8 gCharlen[16] = {1,1,1,1, 1,1,1,1, 1,1,1,1, 2,2,3,4};
DisplayPiece(const char * next_byte_,int byte_length_)213 char* DisplayPiece(const char* next_byte_, int byte_length_) {
214   // Copy up to 8 UTF-8 chars to buffer
215   int k = 0;    // byte count
216   int n = 0;    // character count
217   for (int i = 0; i < byte_length_; ++i) {
218     char c = next_byte_[i];
219     if ((c & 0xc0) != 0x80) {
220       // Beginning of a UTF-8 character
221       int charlen = gCharlen[static_cast<uint8>(c) >> 4];
222       if (i + charlen > byte_length_) {break;} // Not enough room for full char
223       if (k >= (32 - 7)) {break;}   // Not necessarily enough room
224       if (n >= 8) {break;}          // Enough characters already
225       ++n;
226     }
227     if (c == '<') {
228       memcpy(&gDisplayPiece[k], "&lt;", 4); k += 4;
229     } else if (c == '>') {
230       memcpy(&gDisplayPiece[k], "&gt;", 4); k += 4;
231     } else if (c == '&') {
232       memcpy(&gDisplayPiece[k], "&amp;", 5); k += 5;
233     } else if (c == '\'') {
234       memcpy(&gDisplayPiece[k], "&apos;", 6); k += 6;
235     } else if (c == '"') {
236       memcpy(&gDisplayPiece[k], "&quot;", 6); k += 6;
237     } else {
238       gDisplayPiece[k++] = c;
239     }
240   }
241   gDisplayPiece[k++] = '\0';
242   return gDisplayPiece;
243 }
244 
245 
246 
247 // runetochar copies (encodes) one rune, pointed to by r, to at most
248 // UTFmax bytes starting at s and returns the number of bytes generated.
runetochar(char * str,const char32 * rune)249 int runetochar(char *str, const char32 *rune) {
250   // Convert to unsigned for range check.
251   unsigned long c;
252 
253   // 1 char 00-7F
254   c = *rune;
255   if(c <= 0x7F) {
256     str[0] = static_cast<char>(c);
257     return 1;
258   }
259 
260   // 2 char 0080-07FF
261   if(c <= 0x07FF) {
262     str[0] = 0xC0 | static_cast<char>(c >> 1*6);
263     str[1] = 0x80 | (c & 0x3F);
264     return 2;
265   }
266 
267   // Range check
268   if (c > Runemax) {
269     c = Runeerror;
270   }
271 
272   // 3 char 0800-FFFF
273   if (c <= 0xFFFF) {
274     str[0] = 0xE0 | static_cast<char>(c >> 2*6);
275     str[1] = 0x80 | ((c >> 1*6) & 0x3F);
276     str[2] = 0x80 | (c & 0x3F);
277     return 3;
278   }
279 
280   // 4 char 10000-1FFFFF
281   str[0] = 0xF0 | static_cast<char>(c >> 3*6);
282   str[1] = 0x80 | ((c >> 2*6) & 0x3F);
283   str[2] = 0x80 | ((c >> 1*6) & 0x3F);
284   str[3] = 0x80 | (c & 0x3F);
285   return 4;
286 }
287 
288 
289 
290 // Useful for converting an entity to an ascii value.
291 // RETURNS unicode value, or -1 if entity isn't valid.  Don't include & or ;
LookupEntity(const char * entity_name,int entity_len)292 int LookupEntity(const char* entity_name, int entity_len) {
293   // Make a C string
294   if (entity_len >= 16) {return -1;}    // All real entities are shorter
295   char temp[16];
296   memcpy(temp, entity_name, entity_len);
297   temp[entity_len] = '\0';
298   int match = BinarySearch(temp, 0, kNameToEntitySize, kNameToEntity);
299   if (match >= 0) {return kNameToEntity[match].i;}
300   return -1;
301 }
302 
// True for the ASCII decimal digits '0'..'9' only.
bool ascii_isdigit(char c) {
  // Bytes below '0' wrap around to a large unsigned distance, so a single
  // comparison covers both ends of the range.
  const unsigned int distance = static_cast<unsigned int>(c) - '0';
  return distance <= 9u;
}
// True for ASCII hexadecimal digits: '0'..'9', 'a'..'f', 'A'..'F'.
bool ascii_isxdigit(char c) {
  const bool is_decimal = ('0' <= c) && (c <= '9');
  // Setting bit 0x20 maps 'A'..'F' onto 'a'..'f' and leaves 'a'..'f' alone
  const int folded = c | 0x20;
  const bool is_hex_letter = ('a' <= folded) && (folded <= 'f');
  return is_decimal || is_hex_letter;
}
// True for ASCII letters and decimal digits: '0'..'9', 'a'..'z', 'A'..'Z'.
bool ascii_isalnum(char c) {
  const bool is_decimal = ('0' <= c) && (c <= '9');
  // Setting bit 0x20 maps 'A'..'Z' onto 'a'..'z' and leaves 'a'..'z' alone
  const int folded = c | 0x20;
  const bool is_letter = ('a' <= folded) && (folded <= 'z');
  return is_decimal || is_letter;
}
// Numeric value (0..15) of an ASCII hexadecimal digit.
// RETURNS 0 for any byte that is not a hexadecimal digit.
int hex_digit_to_int(char c) {
  int value = 0;
  if (('0' <= c) && (c <= '9')) {
    value = c - '0';
  } else if (('a' <= c) && (c <= 'f')) {
    value = 10 + (c - 'a');
  } else if (('A' <= c) && (c <= 'F')) {
    value = 10 + (c - 'A');
  }
  return value;
}
324 
strto32_base10(const char * nptr,const char * limit,const char ** endptr)325 static int32 strto32_base10(const char* nptr, const char* limit,
326                             const char **endptr) {
327   *endptr = nptr;
328   while (nptr < limit && *nptr == '0') {
329     ++nptr;
330   }
331   if (nptr == limit || !ascii_isdigit(*nptr))
332     return -1;
333   const char* end_digits_run = nptr;
334   while (end_digits_run < limit && ascii_isdigit(*end_digits_run)) {
335     ++end_digits_run;
336   }
337   *endptr = end_digits_run;
338   const int num_digits = end_digits_run - nptr;
339   // kint32max == 2147483647.
340   if (num_digits < 9 ||
341       (num_digits == 10 && memcmp(nptr, "2147483647", 10) <= 0)) {
342     int value = 0;
343     for (; nptr < end_digits_run; ++nptr) {
344       value *= 10;
345       value += *nptr - '0';
346     }
347     // Overflow past the last valid unicode codepoint
348     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
349     return FixUnicodeValue(value);
350   } else {
351     // Overflow: can't fit in an int32;
352     // returns the replacement character 0xFFFD.
353     return 0xFFFD;
354   }
355 }
356 
strto32_base16(const char * nptr,const char * limit,const char ** endptr)357 static int32 strto32_base16(const char* nptr, const char* limit,
358                             const char **endptr) {
359   *endptr = nptr;
360   while (nptr < limit && *nptr == '0') {
361     ++nptr;
362   }
363   if (nptr == limit || !ascii_isxdigit(*nptr)) {
364     return -1;
365   }
366   const char* end_xdigits_run = nptr;
367   while (end_xdigits_run < limit && ascii_isxdigit(*end_xdigits_run)) {
368     ++end_xdigits_run;
369   }
370   *endptr = end_xdigits_run;
371   const int num_xdigits = end_xdigits_run - nptr;
372   // kint32max == 0x7FFFFFFF.
373   if (num_xdigits < 8 || (num_xdigits == 8 && nptr[0] < '8')) {
374     int value = 0;
375     for (; nptr < end_xdigits_run; ++nptr) {
376       value <<= 4;
377       value += hex_digit_to_int(*nptr);
378     }
379     // Overflow past the last valid unicode codepoint
380     // (0x10ffff) is converted to U+FFFD by FixUnicodeValue().
381     return FixUnicodeValue(value);
382   } else {
383     // Overflow: can't fit in an int32;
384     // returns the replacement character 0xFFFD.
385     return 0xFFFD;
386   }
387 }
388 
389 // Unescape the current character pointed to by src.  SETS the number
390 // of chars read for the conversion (in UTF8).  If src isn't a valid entity,
391 // just consume the & and RETURN -1.  If src doesn't point to & -- which it
392 // should -- set src_consumed to 0 and RETURN -1.
ReadEntity(const char * src,int srcn,int * src_consumed)393 int ReadEntity(const char* src, int srcn, int* src_consumed) {
394   const char* const srcend = src + srcn;
395 
396   if (srcn == 0 || *src != '&') {      // input should start with an ampersand
397     *src_consumed = 0;
398     return -1;
399   }
400   *src_consumed = 1;                   // we'll get the & at least
401 
402   // The standards are a bit unclear on when an entity ends.  Certainly a ";"
403   // ends one, but spaces probably do too.  We follow the lead of both IE and
404   // Netscape, which as far as we can tell end numeric entities (1st case below)
405   // at any non-digit, and end character entities (2nd case) at any non-alnum.
406   const char* entstart, *entend;  // where the entity starts and ends
407   entstart = src + 1;             // read past the &
408   int entval;                     // UCS2 value of the entity
409   if ( *entstart == '#' ) {       // -- 1st case: numeric entity
410     if ( entstart + 2 >= srcend ) {
411       return -1;                  // no way a legitimate number could fit
412     } else if ( entstart[1] == 'x' || entstart[1] == 'X' ) {   // hex numeric
413       entval = strto32_base16(entstart + 2, srcend, &entend);
414     } else {                                  // decimal numeric entity
415       entval = strto32_base10(entstart+1, srcend, &entend);
416     }
417     if (entval == -1 || entend > srcend) {
418       return -1;                 // not entirely correct, but close enough
419     }
420   } else {                       // -- 2nd case: character entity
421     for (entend = entstart;
422          entend < srcend && ascii_isalnum(*entend);
423          ++entend ) {
424       // entity consists of alphanumeric chars
425     }
426     entval = LookupEntity(entstart, entend - entstart);
427     if (entval < 0) {
428       return -1;  // not a legal entity name
429     }
430     // Now we do a strange-seeming IE6-compatibility check: if entval is
431     // >= 256, it *must* be followed by a semicolon or it's not considered
432     // an entity.  The problem is lots of the newfangled entity names, like
433     // "lang", also occur in URL CGI arguments: "/search?q=test&lang=en".
434     // When these links are written in HTML, it would be really bad if the
435     // "&lang" were treated as an entity, which is what the spec says
436     // *should* happen (even when the HTML is inside an "A HREF" tag!)
437     // IE ignores the spec for these new, high-value entities, so we do too.
438     if ( entval >= 256 && !(entend < srcend && *entend == ';') ) {
439       return -1;                 // make non-;-terminated entity illegal
440     }
441   }
442 
443   // Finally, figure out how much src was consumed
444   if ( entend < srcend && *entend == ';' ) {
445     entend++;                    // standard says ; terminator is special
446   }
447   *src_consumed = entend - src;
448   return entval;
449 }
450 
451 
452 // Src points to '&'
453 // Writes entity value to dst. Returns take(src), put(dst) byte counts
EntityToBuffer(const char * src,int len,char * dst,int * tlen,int * plen)454 void EntityToBuffer(const char* src, int len, char* dst,
455                     int* tlen, int* plen) {
456   char32 entval = ReadEntity(src, len, tlen);
457 
458   // ReadEntity does this already: entval = FixUnicodeValue(entval);
459 
460   // Convert UTF-32 to UTF-8
461   if (entval > 0) {
462     *plen = runetochar(dst, &entval);
463   } else {
464     // Illegal entity; ignore the '&'
465     *tlen = 1;
466     *plen = 0;
467   }
468 }
469 
470 // Returns true if character is < > or &, none of which are letters
IsSpecial(char c)471 bool inline IsSpecial(char c) {
472   if ((c & 0xe0) == 0x20) {
473     return kSpecialSymbol[static_cast<uint8>(c)];
474   }
475   return false;
476 }
477 
478 // Quick Skip to next letter or < > & or to end of string (eos)
479 // Always return is_letter for eos
ScanToLetterOrSpecial(const char * src,int len)480 int ScanToLetterOrSpecial(const char* src, int len) {
481   int bytes_consumed;
482   StringPiece str(src, len);
483   UTF8GenericScan(&utf8scannot_lettermarkspecial_obj, str, &bytes_consumed);
484   return bytes_consumed;
485 }
486 
487 
488 
489 
490 // src points to non-letter, such as tag-opening '<'
491 // Return length from here to next possible letter
492 // On another < before >, return 1
493 // advances <tag>
494 //          |    |
495 // advances <tag> ... </tag>  for <script> <style>
496 //          |               |
497 // advances <!-- ... <tag> ... -->
498 //          |                     |
499 // advances <tag
500 //          |    | end of string
501 // advances <tag <tag2>
502 //          ||
ScanToPossibleLetter(const char * isrc,int len,int max_exit_state)503 int ScanToPossibleLetter(const char* isrc, int len, int max_exit_state) {
504   const uint8* src = reinterpret_cast<const uint8*>(isrc);
505   const uint8* srclimit = src + len;
506   const uint8* tagParseTbl = kTagParseTbl_0;
507   int e = 0;
508   while (src < srclimit) {
509     e = tagParseTbl[kCharToSub[*src++]];
510     if (e <= max_exit_state) {
511       // We overshot by one byte
512       --src;
513       break;
514     }
515     tagParseTbl = &kTagParseTbl_0[e * 20];
516   }
517 
518   if (src >= srclimit) {
519     // We fell off the end of the text.
520     // It looks like the most common case for this is a truncated file, not
521     // mismatched angle brackets. So we pretend that the last char was '>'
522     return len;
523   }
524 
525   // OK to be in state 0 or state 2 at exit
526   if ((e != 0) && (e != 2)) {
527     // Error, '<' followed by '<'
528     // We want to back up to first <, then advance by one byte past it
529     int offset = src - reinterpret_cast<const uint8*>(isrc);
530 
531     // Backscan to first '<' and return enough length to just get past it
532     --offset;   // back up over the second '<', which caused us to stop
533     while ((0 < offset) && (isrc[offset] != '<')) {
534       // Find the first '<', which is unmatched
535       --offset;
536     }
537     // skip to just beyond first '<'
538     return offset + 1;
539   }
540 
541   return src - reinterpret_cast<const uint8*>(isrc);
542 }
543 
544 
ScriptScanner(const char * buffer,int buffer_length,bool is_plain_text)545 ScriptScanner::ScriptScanner(const char* buffer,
546                              int buffer_length,
547                              bool is_plain_text)
548   : start_byte_(buffer),
549   next_byte_(buffer),
550   next_byte_limit_(buffer + buffer_length),
551   byte_length_(buffer_length),
552   is_plain_text_(is_plain_text),
553   letters_marks_only_(true),
554   one_script_only_(true),
555   exit_state_(kMaxExitStateLettersMarksOnly) {
556     script_buffer_ = new char[kMaxScriptBuffer];
557     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
558     map2original_.Clear();    // map from script_buffer_ to buffer
559     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
560 }
561 
562 // Extended version to allow spans of any non-tag text and spans of mixed script
ScriptScanner(const char * buffer,int buffer_length,bool is_plain_text,bool any_text,bool any_script)563 ScriptScanner::ScriptScanner(const char* buffer,
564                              int buffer_length,
565                              bool is_plain_text,
566                              bool any_text,
567                              bool any_script)
568   : start_byte_(buffer),
569   next_byte_(buffer),
570   next_byte_limit_(buffer + buffer_length),
571   byte_length_(buffer_length),
572   is_plain_text_(is_plain_text),
573   letters_marks_only_(!any_text),
574   one_script_only_(!any_script),
575   exit_state_(any_text ? kMaxExitStateAllText : kMaxExitStateLettersMarksOnly) {
576     script_buffer_ = new char[kMaxScriptBuffer];
577     script_buffer_lower_ = new char[kMaxScriptLowerBuffer];
578     map2original_.Clear();    // map from script_buffer_ to buffer
579     map2uplow_.Clear();       // map from script_buffer_lower_ to script_buffer_
580 }
581 
582 
~ScriptScanner()583 ScriptScanner::~ScriptScanner() {
584   delete[] script_buffer_;
585   delete[] script_buffer_lower_;
586 }
587 
588 
589 
590 
591 // Get to the first real non-tag letter or entity that is a letter
592 // Sets script of that letter
593 // Return len if no more letters
SkipToFrontOfSpan(const char * src,int len,int * script)594 int ScriptScanner::SkipToFrontOfSpan(const char* src, int len, int* script) {
595   int sc = UNKNOWN_ULSCRIPT;
596   int skip = 0;
597   int tlen, plen;
598 
599   // Do run of non-letters (tag | &NL | NL)*
600   tlen = 0;
601   while (skip < len) {
602     // Do fast scan to next interesting byte
603     // int oldskip = skip;
604     skip += ScanToLetterOrSpecial(src + skip, len - skip);
605 
606     // Check for no more letters/specials
607     if (skip >= len) {
608       // All done
609       *script = sc;
610       return len;
611     }
612 
613     // We are at a letter, nonletter, tag, or entity
614     if (IsSpecial(src[skip]) && !is_plain_text_) {
615       if (src[skip] == '<') {
616         // Begining of tag; skip to end and go around again
617         tlen = ScanToPossibleLetter(src + skip, len - skip,
618                                     exit_state_);
619         sc = 0;
620       } else if (src[skip] == '>') {
621         // Unexpected end of tag; skip it and go around again
622         tlen = 1;         // Over the >
623         sc = 0;
624       } else if (src[skip] == '&') {
625         // Expand entity, no advance
626         char temp[4];
627         EntityToBuffer(src + skip, len - skip,
628                        temp, &tlen, &plen);
629         if (plen > 0) {
630           sc = GetUTF8LetterScriptNum(temp);
631         }
632       }
633     } else {
634       // Update 1..4 bytes
635       tlen = UTF8OneCharLen(src + skip);
636       sc = GetUTF8LetterScriptNum(src + skip);
637     }
638     if (sc != 0) {break;}           // Letter found
639     skip += tlen;                   // Else advance
640   }
641 
642   *script = sc;
643   return skip;
644 }
645 
646 
// For ASCII-only tag names.
// Compare uplow to the lower-case letter c, ignoring the case of uplow.
inline bool EqCase(char uplow, char c) {
  const int folded = uplow | 0x20;   // force the lower-case bit on
  return folded == c;
}
652 
// For ASCII-only tag names.
// True for space / < > etc., i.e. every byte that compares less than 0x40.
// Note: where char is signed, bytes 0x80 and up are negative and so also
// compare less than 0x40.
inline bool NeqLetter(char c) {
  return !(c >= 0x40);
}
658 
// For ASCII-only tag names.
// True for space and \n only; false for everything else, including \r.
inline bool WS(char c) {
  switch (c) {
    case ' ':
    case '\n':
      return true;
    default:
      return false;
  }
}
664 
// The one line-end byte used in output: both CR and LF are written as LF.
static const char LF = '\n';
667 
668 
669 // The naive loop scans from next_byte_ to script_buffer_ until full.
670 // But this can leave an awkward hard-to-identify short fragment at the
671 // end of the input. We would prefer to make the next-to-last fragment
672 // shorter and the last fragment longer.
673 
674 // Copy next run of non-tag characters to buffer [NUL terminated]
675 // This just replaces tags with space or \n and removes entities.
676 // Tags <br> <p> and <tr> are replaced with \n. Non-letter sequences
677 // including \r or \n are replaced by \n. All other tags and skipped text
678 // are replaced with ASCII space.
679 //
680 // Buffer ALWAYS has leading space and trailing space space space NUL
GetOneTextSpan(LangSpan * span)681 bool ScriptScanner::GetOneTextSpan(LangSpan* span) {
682   span->text = script_buffer_;
683   span->text_bytes = 0;
684   span->offset = next_byte_ - start_byte_;
685   span->ulscript = UNKNOWN_ULSCRIPT;
686   span->lang = UNKNOWN_LANGUAGE;
687   span->truncated = false;
688 
689   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
690   if ((kMaxScriptBytes <= byte_length_) &&
691       (byte_length_ < (2 * kMaxScriptBytes))) {
692     // Try to split the last two fragments in half
693     put_soft_limit = byte_length_ / 2;
694   }
695 
696   script_buffer_[0] = ' ';  // Always a space at front of output
697   script_buffer_[1] = '\0';
698   int take = 0;
699   int put = 1;              // Start after the initial space
700   int tlen, plen;
701 
702   if (byte_length_ <= 0) {
703     return false;          // No more text to be found
704   }
705 
706   // Go over alternating spans of text and tags,
707   // copying letters to buffer with single spaces for each run of non-letters
708   bool last_byte_was_space = false;
709   while (take < byte_length_) {
710     char c = next_byte_[take];
711     if (c == '\r') {c = LF;}      // Canonical CR or LF
712     if (c == '\n') {c = LF;}      // Canonical CR or LF
713 
714     if (IsSpecial(c) && !is_plain_text_) {
715       if (c == '<') {
716         // Replace tag with space
717         c = ' ';                      // for almost-full test below
718         // or if <p> <br> <tr>, replace with \n
719         if (take < (byte_length_ - 3)) {
720           if (EqCase(next_byte_[take + 1], 'p') &&
721               NeqLetter(next_byte_[take + 2])) {
722             c = LF;
723           }
724           if (EqCase(next_byte_[take + 1], 'b') &&
725               EqCase(next_byte_[take + 2], 'r') &&
726               NeqLetter(next_byte_[take + 3])) {
727             c = LF;
728           }
729           if (EqCase(next_byte_[take + 1], 't') &&
730               EqCase(next_byte_[take + 2], 'r') &&
731               NeqLetter(next_byte_[take + 3])) {
732             c = LF;
733           }
734         }
735         // Begining of tag; skip to end and go around again
736         tlen = 1 + ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
737                                     exit_state_);
738         // Copy one byte, compressing spaces
739         if (!last_byte_was_space || !WS(c)) {
740           script_buffer_[put++] = c;      // Advance dest
741           last_byte_was_space = WS(c);
742         }
743       } else if (c == '>') {
744         // Unexpected end of tag; copy it and go around again
745         tlen = 1;         // Over the >
746         script_buffer_[put++] = c;    // Advance dest
747       } else if (c == '&') {
748         // Expand entity, no advance
749         EntityToBuffer(next_byte_ + take, byte_length_ - take,
750                        script_buffer_ + put, &tlen, &plen);
751         put += plen;                  // Advance dest
752       }
753       take += tlen;                   // Advance source
754     } else {
755       // Copy one byte, compressing spaces
756       if (!last_byte_was_space || !WS(c)) {
757         script_buffer_[put++] = c;      // Advance dest
758         last_byte_was_space = WS(c);
759       }
760       ++take;                         // Advance source
761     }
762 
763     if (WS(c) &&
764         (put >= put_soft_limit)) {
765       // Buffer is almost full
766       span->truncated = true;
767       break;
768     }
769     if (put >= kMaxScriptBytes) {
770       // Buffer is completely full
771       span->truncated = true;
772       break;
773     }
774   }
775 
776   // Almost done. Back up to a character boundary if needed
777   while ((0 < take) && ((next_byte_[take] & 0xc0) == 0x80)) {
778     // Back up over continuation byte
779     --take;
780     --put;
781   }
782 
783   // Update input position
784   next_byte_ += take;
785   byte_length_ -= take;
786 
787   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
788   //                          kMaxScriptBytes |   | put
789   script_buffer_[put + 0] = ' ';
790   script_buffer_[put + 1] = ' ';
791   script_buffer_[put + 2] = ' ';
792   script_buffer_[put + 3] = '\0';
793 
794   span->text_bytes = put;       // Does not include the last four chars above
795   return true;
796 }
797 
798 
799 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
800 // Buffer ALWAYS has leading space and trailing space space space NUL
GetOneScriptSpan(LangSpan * span)801 bool ScriptScanner::GetOneScriptSpan(LangSpan* span) {
802   if (!letters_marks_only_) {
803     // Return non-tag text, including punctuation and digits
804     return GetOneTextSpan(span);
805   }
806 
807   span->text = script_buffer_;
808   span->text_bytes = 0;
809   span->offset = next_byte_ - start_byte_;
810   span->ulscript = UNKNOWN_ULSCRIPT;
811   span->lang = UNKNOWN_LANGUAGE;
812   span->truncated = false;
813 
814   // struct timeval script_start, script_mid, script_end;
815 
816   int put_soft_limit = kMaxScriptBytes - kWithinScriptTail;
817   if ((kMaxScriptBytes <= byte_length_) &&
818       (byte_length_ < (2 * kMaxScriptBytes))) {
819     // Try to split the last two fragments in half
820     put_soft_limit = byte_length_ / 2;
821   }
822 
823 
824   int spanscript;           // The script of this span
825   int sc = UNKNOWN_ULSCRIPT;  // The script of next character
826   int tlen = 0;
827   int plen = 0;
828 
829   script_buffer_[0] = ' ';  // Always a space at front of output
830   script_buffer_[1] = '\0';
831   int take = 0;
832   int put = 1;              // Start after the initial space
833 
834   // Build offsets from span->text back to start_byte_ + span->offset
835   // This mapping reflects deletion of non-letters, expansion of
836   // entities, etc.
837   map2original_.Clear();
838   map2original_.Delete(span->offset);   // So that MapBack(0) gives offset
839 
840   // Get to the first real non-tag letter or entity that is a letter
841   int skip = SkipToFrontOfSpan(next_byte_, byte_length_, &spanscript);
842   next_byte_ += skip;
843   byte_length_ -= skip;
844 
845   if (skip != 1) {
846     map2original_.Delete(skip);
847     map2original_.Insert(1);
848   } else {
849     map2original_.Copy(1);
850   }
851   if (byte_length_ <= 0) {
852     map2original_.Reset();
853     return false;               // No more letters to be found
854   }
855 
856   // There is at least one letter, so we know the script for this span
857   span->ulscript = (ULScript)spanscript;
858 
859 
860   // Go over alternating spans of same-script letters and non-letters,
861   // copying letters to buffer with single spaces for each run of non-letters
862   while (take < byte_length_) {
863     // Copy run of letters in same script (&LS | LS)*
864     int letter_count = 0;              // Keep track of word length
865     bool need_break = false;
866 
867     while (take < byte_length_) {
868       // We are at a letter, nonletter, tag, or entity
869       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
870         if (next_byte_[take] == '<') {
871           // Begining of tag
872           sc = 0;
873           break;
874         } else if (next_byte_[take] == '>') {
875           // Unexpected end of tag
876           sc = 0;
877           break;
878         } else if (next_byte_[take] == '&') {
879           // Copy entity, no advance
880           EntityToBuffer(next_byte_ + take, byte_length_ - take,
881                          script_buffer_ + put, &tlen, &plen);
882           if (plen > 0) {
883             sc = GetUTF8LetterScriptNum(script_buffer_ + put);
884           }
885         }
886       } else {
887         // Real letter, safely copy up to 4 bytes, increment by 1..4
888         // Will update by 1..4 bytes at Advance, below
889         tlen = plen = UTF8OneCharLen(next_byte_ + take);
890         if (take < (byte_length_ - 3)) {
891           // X86 fast case, does unaligned load/store
892           UNALIGNED_STORE32(script_buffer_ + put,
893                             UNALIGNED_LOAD32(next_byte_ + take));
894 
895         } else {
896           // Slow case, happens 1-3 times per input document
897           memcpy(script_buffer_ + put, next_byte_ + take, plen);
898         }
899         sc = GetUTF8LetterScriptNum(next_byte_ + take);
900       }
901 
902       // Allow continue across a single letter in a different script:
903       // A B D = three scripts, c = common script, i = inherited script,
904       // - = don't care, ( = take position before the += below
905       //  AAA(A-    continue
906       //
907       //  AAA(BA    continue
908       //  AAA(BB    break
909       //  AAA(Bc    continue (breaks after B)
910       //  AAA(BD    break
911       //  AAA(Bi    break
912       //
913       //  AAA(c-    break
914       //
915       //  AAA(i-    continue
916       //
917 
918       if ((sc != spanscript) && (sc != ULScript_Inherited)) {
919         // Might need to break this script span
920         if (sc == ULScript_Common) {
921           need_break = true;
922         } else {
923           // Look at next following character, ignoring entity as Common
924           int sc2 = GetUTF8LetterScriptNum(next_byte_ + take + tlen);
925           if ((sc2 != ULScript_Common) && (sc2 != spanscript)) {
926             // We found a non-trivial change of script
927             if (one_script_only_) {
928               need_break = true;
929             }
930           }
931         }
932       }
933       if (need_break) {break;}  // Non-letter or letter in wrong script
934 
935       take += tlen;                   // Advance
936       put += plen;                    // Advance
937 
938       // Update the offset map to reflect take/put lengths
939       if (tlen == plen) {
940         map2original_.Copy(tlen);
941       } else if (tlen < plen) {
942         map2original_.Copy(tlen);
943         map2original_.Insert(plen - tlen);
944       } else {    // plen < tlen
945         map2original_.Copy(plen);
946         map2original_.Delete(tlen - plen);
947       }
948 
949       ++letter_count;
950       if (put >= kMaxScriptBytes) {
951         // Buffer is full
952         span->truncated = true;
953         break;
954       }
955     }     // End while letters
956 
957     // Do run of non-letters (tag | &NL | NL)*
958     while (take < byte_length_) {
959       // Do fast scan to next interesting byte
960       tlen = ScanToLetterOrSpecial(next_byte_ + take, byte_length_ - take);
961       take += tlen;
962       map2original_.Delete(tlen);
963       if (take >= byte_length_) {break;}    // Might have scanned to end
964 
965       // We are at a letter, nonletter, tag, or entity
966       if (IsSpecial(next_byte_[take]) && !is_plain_text_) {
967         if (next_byte_[take] == '<') {
968           // Begining of tag; skip to end and go around again
969           tlen = ScanToPossibleLetter(next_byte_ + take, byte_length_ - take,
970                                       exit_state_);
971           sc = 0;
972         } else if (next_byte_[take] == '>') {
973           // Unexpected end of tag; skip it and go around again
974           tlen = 1;         // Over the >
975           sc = 0;
976         } else if (next_byte_[take] == '&') {
977           // Expand entity, no advance
978           EntityToBuffer(next_byte_ + take, byte_length_ - take,
979                          script_buffer_ + put, &tlen, &plen);
980           if (plen > 0) {
981             sc = GetUTF8LetterScriptNum(script_buffer_ + put);
982           }
983         }
984       } else {
985         // Update 1..4
986         tlen = UTF8OneCharLen(next_byte_ + take);
987         sc = GetUTF8LetterScriptNum(next_byte_ + take);
988       }
989       if (sc != 0) {break;}           // Letter found
990       take += tlen;                   // Else advance
991       map2original_.Delete(tlen);
992     }     // End while not-letters
993 
994     script_buffer_[put++] = ' ';
995     map2original_.Insert(1);
996 
997     // Letter in wrong script ?
998     if ((sc != spanscript) && (sc != ULScript_Inherited)) {break;}
999     if (put >= put_soft_limit) {
1000       // Buffer is almost full
1001       span->truncated = true;
1002       break;
1003     }
1004   }
1005 
1006   // Almost done. Back up to a character boundary if needed
1007   while ((0 < take) && (take < byte_length_) &&
1008          ((next_byte_[take] & 0xc0) == 0x80)) {
1009     // Back up over continuation byte
1010     --take;
1011     --put;
1012   }
1013 
1014   // Update input position
1015   next_byte_ += take;
1016   byte_length_ -= take;
1017 
1018   // Put four more spaces/NUL. Worst case is abcd _ _ _ \0
1019   //                          kMaxScriptBytes |   | put
1020   script_buffer_[put + 0] = ' ';
1021   script_buffer_[put + 1] = ' ';
1022   script_buffer_[put + 2] = ' ';
1023   script_buffer_[put + 3] = '\0';
1024   map2original_.Insert(4);
1025   map2original_.Reset();
1026 
1027   span->text_bytes = put;       // Does not include the last four chars above
1028   return true;
1029 }
1030 
1031 // Force Latin, Cyrillic, Armenian, Greek scripts to be lowercase
1032 // List changes with each version of Unicode, so just always lowercase
1033 // Unicode 6.2.0:
1034 //   ARMENIAN COPTIC CYRILLIC DESERET GEORGIAN GLAGOLITIC GREEK LATIN
LowerScriptSpan(LangSpan * span)1035 void ScriptScanner::LowerScriptSpan(LangSpan* span) {
1036   // If needed, lowercase all the text. If we do it sooner, might miss
1037   // lowercasing an entity such as &Aacute;
1038   // We only need to do this for Latn and Cyrl scripts
1039   map2uplow_.Clear();
1040   // Full Unicode lowercase of the entire buffer, including
1041   // four pad bytes off the end.
1042   // Ahhh. But the last byte 0x00 is not interchange-valid, so we do 3 pad
1043   // bytes and put the 0x00 in explicitly.
1044   // Build an offset map from script_buffer_lower_ back to script_buffer_
1045   int consumed, filled, changed;
1046   StringPiece istr(span->text, span->text_bytes + 3);
1047   StringPiece ostr(script_buffer_lower_, kMaxScriptLowerBuffer);
1048 
1049   UTF8GenericReplace(&utf8repl_lettermarklower_obj,
1050                             istr, ostr, is_plain_text_,
1051                             &consumed, &filled, &changed, &map2uplow_);
1052   script_buffer_lower_[filled] = '\0';
1053   span->text = script_buffer_lower_;
1054   span->text_bytes = filled - 3;
1055   map2uplow_.Reset();
1056 }
1057 
1058 // Copy next run of same-script non-tag letters to buffer [NUL terminated]
1059 // Force Latin, Cyrillic, Greek scripts to be lowercase
1060 // Buffer ALWAYS has leading space and trailing space space space NUL
GetOneScriptSpanLower(LangSpan * span)1061 bool ScriptScanner::GetOneScriptSpanLower(LangSpan* span) {
1062   bool ok = GetOneScriptSpan(span);
1063   if (ok) {
1064     LowerScriptSpan(span);
1065   }
1066   return ok;
1067 }
1068 
1069 
1070 // Maps byte offset in most recent GetOneScriptSpan/Lower
1071 // span->text [0..text_bytes] into an additional byte offset from
1072 // span->offset, to get back to corresponding text in the original
1073 // input buffer.
1074 // text_offset must be the first byte
1075 // of a UTF-8 character, or just beyond the last character. Normally this
1076 // routine is called with the first byte of an interesting range and
1077 // again with the first byte of the following range.
MapBack(int text_offset)1078 int ScriptScanner::MapBack(int text_offset) {
1079   return map2original_.MapBack(map2uplow_.MapBack(text_offset));
1080 }
1081 
1082 
1083 // Gets lscript number for letters; always returns
1084 //   0 (common script) for non-letters
GetUTF8LetterScriptNum(const char * src)1085 int GetUTF8LetterScriptNum(const char* src) {
1086   int srclen = UTF8OneCharLen(src);
1087   const uint8* usrc = reinterpret_cast<const uint8*>(src);
1088   return UTF8GenericPropertyTwoByte(&utf8prop_lettermarkscriptnum_obj,
1089                                     &usrc, &srclen);
1090 }
1091 
1092 }  // namespace CLD2
1093 
1094 
1095