1 // Copyright 2019 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 #[macro_use]
11 extern crate arrayref;
12 extern crate memmap2;
13 #[macro_use]
14 extern crate log;
15 
16 use std::slice;
17 use std::str;
18 use std::cmp::max;
19 use std::fs::File;
20 use std::mem;
21 
22 use memmap2::Mmap;
23 
24 // Make submodules available publicly.
25 pub mod builder;
26 pub mod ffi;
27 
28 // 4-byte identification expected at beginning of a compiled dictionary file.
29 // (This will be updated if an incompatible change to the format is made in
30 // some future revision.)
31 const MAGIC_NUMBER: [u8; 4] = [b'H', b'y', b'f', b'0'];
32 
33 const INVALID_STRING_OFFSET: u16 = 0xffff;
34 const INVALID_STATE_OFFSET: u32 = 0x00ff_ffff;
35 
36 const FILE_HEADER_SIZE: usize = 8; // 4-byte magic number, 4-byte count of levels
37 const LEVEL_HEADER_SIZE: usize = 16;
38 
39 // Transition actually holds a 24-bit new state offset and an 8-bit input byte
40 // to match. We will be interpreting byte ranges as Transition arrays (in the
41 // State::transitions() method below), so use repr(C) to ensure we have the
42 // memory layout we expect.
43 // Transition records do not depend on any specific alignment.
44 #[repr(C)]
45 #[derive(Debug,Copy,Clone)]
46 struct Transition(u8, u8, u8, u8);
47 
48 impl Transition {
new_state_offset(&self) -> usize49     fn new_state_offset(&self) -> usize {
50         // Read a 24-bit little-endian number from three bytes.
51         self.0 as usize + ((self.1 as usize) << 8) + ((self.2 as usize) << 16)
52     }
match_byte(&self) -> u853     fn match_byte(&self) -> u8 {
54         self.3
55     }
56 }
57 
58 // State is an area of the Level's data block that begins with a fixed header,
59 // followed by an array of transitions. The total size of each State's data
60 // depends on the number of transitions in the state. Only the basic header
61 // is defined by the struct here; the rest of the state is accessed via
62 // pointer magic.
63 // There are two versions of State, a basic version that supports only simple
64 // hyphenation (no associated spelling change), and an extended version that
65 // adds the replacement-string fields to support spelling changes at the
66 // hyphenation point. Check is_extended() to know which version is present.
67 // State records are NOT necessarily 4-byte aligned, so multi-byte fields
68 // should be read with care.
69 #[derive(Debug,Copy,Clone)]
70 #[repr(C)]
71 struct State {
72     fallback_state: [u8; 4],
73     match_string_offset: [u8; 2],
74     num_transitions: u8,
75     is_extended: u8,
76 }
77 
78 #[repr(C)]
79 struct StateExtended {
80     state: State,
81     repl_string_offset: [u8; 2],
82     repl_index: i8,
83     repl_cut: i8,
84 }
85 
86 impl State {
87     // Accessors for the various State header fields; see file format description.
fallback_state(&self) -> usize88     fn fallback_state(&self) -> usize {
89         u32::from_le_bytes(self.fallback_state) as usize
90     }
match_string_offset(&self) -> usize91     fn match_string_offset(&self) -> usize {
92         u16::from_le_bytes(self.match_string_offset) as usize
93     }
num_transitions(&self) -> u894     fn num_transitions(&self) -> u8 {
95         self.num_transitions
96     }
is_extended(&self) -> bool97     fn is_extended(&self) -> bool {
98         self.is_extended != 0
99     }
100     // Accessors that are only valid if is_extended() is true.
101     // These use `unsafe` to dereference a pointer to the relevant field;
102     // this is OK because Level::get_state always validates the total state size
103     // before returning a state reference, so these pointers will be valid for
104     // any extended state it returns.
105     #[allow(dead_code)]
as_extended(&self) -> &StateExtended106     fn as_extended(&self) -> &StateExtended {
107         debug_assert!(self.is_extended());
108         unsafe { mem::transmute(self) }
109     }
110     #[allow(dead_code)]
repl_string_offset(&self) -> usize111     fn repl_string_offset(&self) -> usize {
112         u16::from_le_bytes(self.as_extended().repl_string_offset) as usize
113     }
114     #[allow(dead_code)]
repl_index(&self) -> i8115     fn repl_index(&self) -> i8 {
116         self.as_extended().repl_index
117     }
118     #[allow(dead_code)]
repl_cut(&self) -> i8119     fn repl_cut(&self) -> i8 {
120         self.as_extended().repl_cut
121     }
122     // Return the state's Transitions as a slice reference.
transitions(&self) -> &[Transition]123     fn transitions(&self) -> &[Transition] {
124         let count = self.num_transitions() as usize;
125         if count == 0 {
126             return &[];
127         }
128         let transition_offset = if self.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() } as isize;
129         // We know the `offset` here will not look beyond the valid range of memory
130         // because Level::get_state() checks the state length (accounting for the
131         // number of transitions) before returning a State reference.
132         let trans_ptr = unsafe { (self as *const State as *const u8).offset(transition_offset) as *const Transition };
133         // Again, because Level::get_state() already checked the state length, we know
134         // this slice address and count will be valid.
135         unsafe { slice::from_raw_parts(trans_ptr, count) }
136     }
137     // Look up the Transition for a given input byte, or None.
transition_for(&self, b: u8) -> Option<Transition>138     fn transition_for(&self, b: u8) -> Option<Transition> {
139         // The transitions array is sorted by match_byte() value, but there are
140         // usually very few entries; benchmarking showed that using binary_search_by
141         // here gave no benefit (possibly slightly slower).
142         self.transitions().iter().copied().find(|t| t.match_byte() == b)
143     }
144     // Just for debugging use...
145     #[allow(dead_code)]
deep_show(&self, prefix: &str, dic: &Level)146     fn deep_show(&self, prefix: &str, dic: &Level) {
147         if self.match_string_offset() != INVALID_STRING_OFFSET as usize {
148             let match_string = dic.string_at_offset(self.match_string_offset());
149             println!("{}match: {}", prefix, str::from_utf8(match_string).unwrap());
150         }
151         for t in self.transitions() {
152             println!("{}{} ->", prefix, t.match_byte() as char);
153             let next_prefix = format!("{}  ", prefix);
154             dic.get_state(t.new_state_offset()).unwrap().deep_show(&next_prefix, &dic);
155         }
156     }
157 }
158 
159 // We count the presentation-form ligature characters U+FB00..FB06 as multiple
160 // chars for the purposes of lefthyphenmin/righthyphenmin. In UTF-8, all these
161 // ligature characters are 3-byte sequences beginning with <0xEF, 0xAC>; this
162 // helper returns the "decomposed length" of the ligature given its trailing
163 // byte.
lig_length(trail_byte: u8) -> usize164 fn lig_length(trail_byte: u8) -> usize {
165     // This is only called on valid UTF-8 where we already know trail_byte
166     // must be >= 0x80.
167     // Ligature lengths:       ff   fi   fl   ffi  ffl  long-st  st
168     const LENGTHS: [u8; 7] = [ 2u8, 2u8, 2u8, 3u8, 3u8, 2u8,     2u8 ];
169     if trail_byte > 0x86 {
170         return 1;
171     }
172     LENGTHS[trail_byte as usize - 0x80] as usize
173 }
174 
is_utf8_trail_byte(byte: u8) -> bool175 fn is_utf8_trail_byte(byte: u8) -> bool {
176     (byte & 0xC0) == 0x80
177 }
178 
is_ascii_digit(byte: u8) -> bool179 fn is_ascii_digit(byte: u8) -> bool {
180     byte <= b'9' && byte >= b'0'
181 }
182 
is_odd(byte: u8) -> bool183 fn is_odd(byte: u8) -> bool {
184     (byte & 0x01) == 0x01
185 }
186 
187 // A hyphenation Level has a header followed by State records and packed string
188 // data. The total size of the slice depends on the number and size of the
189 // States and Strings it contains.
190 // Note that the data of the Level may not have any specific alignment!
191 #[derive(Debug,Copy,Clone)]
192 struct Level<'a> {
193     data: &'a [u8],
194     // Header fields cached by the constructor for faster access:
195     state_data_base_: usize,
196     string_data_base_: usize,
197 }
198 
199 impl Level<'_> {
200     // Constructor that initializes our cache variables.
new(data: &[u8]) -> Level201     fn new(data: &[u8]) -> Level {
202         Level {
203             data,
204             state_data_base_: u32::from_le_bytes(*array_ref!(data, 0, 4)) as usize,
205             string_data_base_: u32::from_le_bytes(*array_ref!(data, 4, 4)) as usize,
206         }
207     }
208 
209     // Accessors for Level header fields; see file format description.
state_data_base(&self) -> usize210     fn state_data_base(&self) -> usize {
211         self.state_data_base_ // cached by constructor
212     }
string_data_base(&self) -> usize213     fn string_data_base(&self) -> usize {
214         self.string_data_base_ // cached by constructor
215     }
nohyphen_string_offset(&self) -> usize216     fn nohyphen_string_offset(&self) -> usize {
217         u16::from_le_bytes(*array_ref!(self.data, 8, 2)) as usize
218     }
219     #[allow(dead_code)]
nohyphen_count(&self) -> u16220     fn nohyphen_count(&self) -> u16 {
221         u16::from_le_bytes(*array_ref!(self.data, 10, 2))
222     }
lh_min(&self) -> usize223     fn lh_min(&self) -> usize {
224         max(1, self.data[12] as usize)
225     }
rh_min(&self) -> usize226     fn rh_min(&self) -> usize {
227         max(1, self.data[13] as usize)
228     }
clh_min(&self) -> usize229     fn clh_min(&self) -> usize {
230         max(1, self.data[14] as usize)
231     }
crh_min(&self) -> usize232     fn crh_min(&self) -> usize {
233         max(1, self.data[15] as usize)
234     }
word_boundary_mins(&self) -> (usize, usize, usize, usize)235     fn word_boundary_mins(&self) -> (usize, usize, usize, usize) {
236         (self.lh_min(), self.rh_min(), self.clh_min(), self.crh_min())
237     }
238     // Strings are represented as offsets from the Level's string_data_base.
239     // This returns a byte slice referencing the string at a given offset,
240     // or an empty slice if invalid.
string_at_offset(&self, offset: usize) -> &'_ [u8]241     fn string_at_offset(&self, offset: usize) -> &'_ [u8] {
242         if offset == INVALID_STRING_OFFSET as usize {
243             return &[];
244         }
245         let string_base = self.string_data_base() as usize + offset;
246         // TODO: move this to the validation function.
247         debug_assert!(string_base < self.data.len());
248         if string_base + 1 > self.data.len() {
249             return &[];
250         }
251         let len = self.data[string_base] as usize;
252         // TODO: move this to the validation function.
253         debug_assert!(string_base + 1 + len <= self.data.len());
254         if string_base + 1 + len > self.data.len() {
255             return &[];
256         }
257         self.data.get(string_base + 1 .. string_base + 1 + len).unwrap()
258     }
259     // The nohyphen field actually contains multiple NUL-separated substrings;
260     // return them as a vector of individual byte slices.
nohyphen(&self) -> Vec<&[u8]>261     fn nohyphen(&self) -> Vec<&[u8]> {
262         let string_offset = self.nohyphen_string_offset();
263         let nohyph_str = self.string_at_offset(string_offset as usize);
264         if nohyph_str.is_empty() {
265             return vec![];
266         }
267         nohyph_str.split(|&b| b == 0).collect()
268     }
269     // States are represented as an offset from the Level's state_data_base.
270     // This returns a reference to the State at a given offset, or None if invalid.
get_state(&self, offset: usize) -> Option<&State>271     fn get_state(&self, offset: usize) -> Option<&State> {
272         if offset == INVALID_STATE_OFFSET as usize {
273             return None;
274         }
275         debug_assert_eq!(offset & 3, 0);
276         let state_base = self.state_data_base() + offset;
277         // TODO: move this to the validation function.
278         debug_assert!(state_base + mem::size_of::<State>() <= self.string_data_base());
279         if state_base + mem::size_of::<State>() > self.string_data_base() {
280             return None;
281         }
282         let state_ptr = &self.data[state_base] as *const u8 as *const State;
283         // This is safe because we just checked against self.string_data_base() above.
284         let state = unsafe { state_ptr.as_ref().unwrap() };
285         let length = if state.is_extended() { mem::size_of::<StateExtended>() } else { mem::size_of::<State>() }
286                 + mem::size_of::<Transition>() * state.num_transitions() as usize;
287         // TODO: move this to the validation function.
288         debug_assert!(state_base + length <= self.string_data_base());
289         if state_base + length > self.string_data_base() {
290             return None;
291         }
292         // This is safe because we checked the full state length against self.string_data_base().
293         unsafe { state_ptr.as_ref() }
294     }
295     // Sets hyphenation values (odd = potential break, even = no break) in values[],
296     // and returns the change in the number of odd values present, so the caller can
297     // keep track of the total number of potential breaks in the word.
find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize298     fn find_hyphen_values(&self, word: &str, values: &mut [u8], lh_min: usize, rh_min: usize) -> isize {
299         // Bail out immediately if the word is too short to hyphenate.
300         if word.len() < lh_min + rh_min {
301             return 0;
302         }
303         let start_state = self.get_state(0);
304         let mut st = start_state;
305         let mut hyph_count = 0;
306         for i in 0 .. word.len() + 2 {
307             // Loop over the word by bytes, with a virtual '.' added at each end
308             // to match word-boundary patterns.
309             let b = if i == 0 || i == word.len() + 1 { b'.' } else { word.as_bytes()[i - 1] };
310             loop {
311                 // Loop to repeatedly fall back if we don't find a matching transition.
312                 // Note that this could infinite-loop if there is a state whose fallback
313                 // points to itself (or a cycle of fallbacks), but this would represent
314                 // a table compilation error.
315                 // (A potential validation function could check for fallback cycles.)
316                 if st.is_none() {
317                     st = start_state;
318                     break;
319                 }
320                 let state = st.unwrap();
321                 if let Some(tr) = state.transition_for(b) {
322                     // Found a transition for the current byte. Look up the new state;
323                     // if it has a match_string, merge its weights into `values`.
324                     st = self.get_state(tr.new_state_offset());
325                     if let Some(state) = st {
326                         let match_offset = state.match_string_offset();
327                         if match_offset != INVALID_STRING_OFFSET as usize {
328                             if state.is_extended() {
329                                 debug_assert!(false, "extended hyphenation not supported by this function");
330                             } else {
331                                 let match_str = self.string_at_offset(match_offset);
332                                 let offset = i + 1 - match_str.len();
333                                 assert!(offset + match_str.len() <= word.len() + 2);
334                                 for (j, ch) in match_str.iter().enumerate() {
335                                     let index = offset + j;
336                                     if index >= lh_min && index <= word.len() - rh_min {
337                                         // lh_min and rh_min are guaranteed to be >= 1,
338                                         // so this will not try to access outside values[].
339                                         let old_value = values[index - 1];
340                                         let value = ch - b'0';
341                                         if value > old_value {
342                                             if is_odd(old_value) != is_odd(value) {
343                                                 // Adjust hyph_count for the change we're making
344                                                 hyph_count += if is_odd(value) { 1 } else { -1 };
345                                             }
346                                             values[index - 1] = value;
347                                         }
348                                     }
349                                 }
350                             }
351                         }
352                     }
353                     // We have handled the current input byte; leave the fallback loop
354                     // and get next input.
355                     break;
356                 }
357                 // No transition for the current byte; go to fallback state and try again.
358                 st = self.get_state(state.fallback_state());
359             }
360         }
361 
362         // If the word was not purely ASCII, or if the word begins/ends with
363         // digits, the use of lh_min and rh_min above may not have correctly
364         // excluded enough positions, so we need to fix things up here.
365         let mut index = 0;
366         let mut count = 0;
367         let word_bytes = word.as_bytes();
368         let mut clear_hyphen_at = |i| { if is_odd(values[i]) { hyph_count -= 1; } values[i] = 0; };
369         // Handle lh_min.
370         while count < lh_min - 1 && index < word_bytes.len() {
371             let byte = word_bytes[index];
372             clear_hyphen_at(index);
373             if byte < 0x80 {
374                 index += 1;
375                 if is_ascii_digit(byte) {
376                     continue; // ASCII digits don't count
377                 }
378             } else if byte == 0xEF && word_bytes[index + 1] == 0xAC {
379                 // Unicode presentation-form ligature characters, which we count as
380                 // multiple chars for the purpose of lh_min/rh_min, all begin with
381                 // 0xEF, 0xAC in UTF-8.
382                 count += lig_length(word_bytes[index + 2]);
383                 clear_hyphen_at(index + 1);
384                 clear_hyphen_at(index + 2);
385                 index += 3;
386                 continue;
387             } else {
388                 index += 1;
389                 while index < word_bytes.len() && is_utf8_trail_byte(word_bytes[index])  {
390                     clear_hyphen_at(index);
391                     index += 1;
392                 }
393             }
394             count += 1;
395         }
396 
397         // Handle rh_min.
398         count = 0;
399         index = word.len();
400         while count < rh_min && index > 0 {
401             index -= 1;
402             let byte = word_bytes[index];
403             if index < word.len() - 1 {
404                 clear_hyphen_at(index);
405             }
406             if byte < 0x80 {
407                 // Only count if not an ASCII digit
408                 if !is_ascii_digit(byte) {
409                     count += 1;
410                 }
411                 continue;
412             }
413             if is_utf8_trail_byte(byte) {
414                 continue;
415             }
416             if byte == 0xEF && word_bytes[index + 1] == 0xAC {
417                 // Presentation-form ligatures count as multiple chars.
418                 count += lig_length(word_bytes[index + 2]);
419                 continue;
420             }
421             count += 1;
422         }
423 
424         hyph_count
425     }
426 }
427 
428 /// Hyphenation engine encapsulating a language-specific set of patterns (rules)
429 /// that identify possible break positions within a word.
430 pub struct Hyphenator<'a>(&'a [u8]);
431 
432 impl Hyphenator<'_> {
433     /// Return a Hyphenator that wraps the given buffer.
434     /// This does *not* check that the given buffer is in fact a valid hyphenation table.
435     /// Use `is_valid_hyphenator()` to determine whether it is usable.
436     /// (Calling hyphenation methods on a Hyphenator that wraps arbitrary,
437     /// unvalidated data is not unsafe, but may panic.)
new(buffer: &[u8]) -> Hyphenator438     pub fn new(buffer: &[u8]) -> Hyphenator {
439         Hyphenator(buffer)
440     }
441 
442     // Internal implementation details
magic_number(&self) -> &[u8]443     fn magic_number(&self) -> &[u8] {
444         &self.0[0 .. 4]
445     }
num_levels(&self) -> usize446     fn num_levels(&self) -> usize {
447         u32::from_le_bytes(*array_ref!(self.0, 4, 4)) as usize
448     }
level(&self, i: usize) -> Level449     fn level(&self, i: usize) -> Level {
450         let offset = u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i, 4)) as usize;
451         let limit = if i == self.num_levels() - 1 {
452             self.0.len()
453         } else {
454             u32::from_le_bytes(*array_ref!(self.0, FILE_HEADER_SIZE + 4 * i + 4, 4)) as usize
455         };
456         debug_assert!(offset + LEVEL_HEADER_SIZE <= limit && limit <= self.0.len());
457         debug_assert_eq!(offset & 3, 0);
458         debug_assert_eq!(limit & 3, 0);
459         Level::new(&self.0[offset .. limit])
460     }
461 
462     /// Identify acceptable hyphenation positions in the given `word`.
463     ///
464     /// The caller-supplied `values` must be at least as long as the `word`.
465     ///
466     /// On return, any elements with an odd value indicate positions in the word
467     /// after which a hyphen could be inserted.
468     ///
469     /// Returns the number of possible hyphenation positions that were found.
470     ///
471     /// # Panics
472     /// If the given `values` slice is too small to hold the results.
473     ///
474     /// If the block of memory represented by `self.0` is not in fact a valid
475     /// hyphenation dictionary, this function may panic with an overflow or
476     /// array bounds violation.
find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize477     pub fn find_hyphen_values(&self, word: &str, values: &mut [u8]) -> isize {
478         assert!(values.len() >= word.len());
479         values.iter_mut().for_each(|x| *x = 0);
480         let top_level = self.level(0);
481         let (lh_min, rh_min, clh_min, crh_min) = top_level.word_boundary_mins();
482         if word.len() < lh_min + rh_min {
483             return 0;
484         }
485         let mut hyph_count = top_level.find_hyphen_values(word, values, lh_min, rh_min);
486         let compound = hyph_count > 0;
487         // Subsequent levels are applied to fragments between potential breaks
488         // already found:
489         for l in 1 .. self.num_levels() {
490             let level = self.level(l);
491             if hyph_count > 0 {
492                 let mut begin = 0;
493                 let mut lh = lh_min;
494                 // lh_min and rh_min are both guaranteed to be greater than zero,
495                 // so this loop will not reach fully to the end of the word.
496                 for i in lh_min - 1 .. word.len() - rh_min {
497                     if is_odd(values[i]) {
498                         if i > begin {
499                             // We've found a component of a compound;
500                             // clear the corresponding values and apply the new level.
501                             // (These values must be even, so hyph_count is unchanged.)
502                             values[begin .. i].iter_mut().for_each(|x| {
503                                 *x = 0;
504                             });
505                             hyph_count += level.find_hyphen_values(&word[begin ..= i],
506                                                                    &mut values[begin ..= i],
507                                                                    lh, crh_min);
508                         }
509                         begin = i + 1;
510                         lh = clh_min;
511                     }
512                 }
513                 if begin == 0 {
514                     // No compound-word breaks were found, just apply level to the whole word.
515                     hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
516                 } else if begin < word.len() {
517                     // Handle trailing component of compound.
518                     hyph_count += level.find_hyphen_values(&word[begin .. word.len()],
519                                                            &mut values[begin .. word.len()],
520                                                            clh_min, rh_min);
521                 }
522             } else {
523                 hyph_count += level.find_hyphen_values(word, values, lh_min, rh_min);
524             }
525         }
526 
527         // Only need to check nohyphen strings if top-level (compound) breaks were found.
528         if compound && hyph_count > 0 {
529             let nohyph = top_level.nohyphen();
530             if !nohyph.is_empty() {
531                 for i in lh_min ..= word.len() - rh_min {
532                     if is_odd(values[i - 1]) {
533                         for nh in &nohyph {
534                             if i + nh.len() <= word.len() && *nh == &word.as_bytes()[i .. i + nh.len()] {
535                                 values[i - 1] = 0;
536                                 hyph_count -= 1;
537                                 break;
538                             }
539                             if nh.len() <= i && *nh == &word.as_bytes()[i - nh.len() .. i] {
540                                 values[i - 1] = 0;
541                                 hyph_count -= 1;
542                                 break;
543                             }
544                         }
545                     }
546                 }
547             }
548         }
549 
550         hyph_count
551     }
552 
553     /// Generate the hyphenated form of a `word` by inserting the given `hyphen_char`
554     /// at each valid break position.
555     ///
556     /// # Panics
557     /// If the block of memory represented by `self` is not in fact a valid
558     /// hyphenation dictionary, this function may panic with an overflow or
559     /// array bounds violation.
560     ///
561     /// Also panics if the length of the hyphenated word would overflow `usize`.
hyphenate_word(&self, word: &str, hyphchar: char) -> String562     pub fn hyphenate_word(&self, word: &str, hyphchar: char) -> String {
563         let mut values = vec![0u8; word.len()];
564         let hyph_count = self.find_hyphen_values(word, &mut values);
565         if hyph_count <= 0 {
566             return word.to_string();
567         }
568         // We know how long the result will be, so we can preallocate here.
569         let result_len = word.len() + hyph_count as usize * hyphchar.len_utf8();
570         let mut result = String::with_capacity(result_len);
571         let mut n = 0;
572         for ch in word.char_indices() {
573             if ch.0 > 0 && is_odd(values[ch.0 - 1]) {
574                 result.push(hyphchar);
575                 n += 1;
576             }
577             result.push(ch.1);
578         }
579         debug_assert_eq!(n, hyph_count);
580         debug_assert_eq!(result_len, result.len());
581         result
582     }
583 
584     /// Check if the block of memory looks like it could be a valid hyphenation
585     /// table.
is_valid_hyphenator(&self) -> bool586     pub fn is_valid_hyphenator(&self) -> bool {
587         // Size must be at least 4 bytes for magic_number + 4 bytes num_levels;
588         // smaller than this cannot be safely inspected.
589         if self.0.len() < FILE_HEADER_SIZE {
590             return false;
591         }
592         if self.magic_number() != MAGIC_NUMBER {
593             return false;
594         }
595         // For each level, there's a 4-byte offset in the header, and the level
596         // has its own 16-byte header, so we can check a minimum size again here.
597         let num_levels = self.num_levels();
598         if self.0.len() < FILE_HEADER_SIZE + LEVEL_HEADER_SIZE * num_levels {
599             return false;
600         }
601         // Check that state_data_base and string_data_base for each hyphenation
602         // level are within range.
603         for l in 0 .. num_levels {
604             let level = self.level(l);
605             if level.state_data_base() < LEVEL_HEADER_SIZE ||
606                    level.state_data_base() > level.string_data_base() ||
607                    level.string_data_base() > level.data.len() {
608                 return false;
609             }
610             // TODO: consider doing more extensive validation of states and
611             // strings within the level?
612         }
613         // It's still possible the dic is internally broken, but at least it's
614         // worth trying to use it!
615         true
616     }
617 }
618 
619 /// Load the compiled hyphenation file at `dic_path`, if present.
620 ///
621 /// Returns `None` if the specified file cannot be opened or mapped,
622 /// otherwise returns a `memmap2::Mmap` mapping the file.
623 ///
624 /// # Safety
625 ///
626 /// This is unsafe for the same reason `Mmap::map()` is unsafe:
627 /// mapped_hyph does not guarantee safety if the mapped file is modified
628 /// (e.g. by another process) while we're using it.
629 ///
630 /// This verifies that the file looks superficially like it may be a
631 /// compiled hyphenation table, but does *not* fully check the validity
632 /// of the file contents! Calling hyphenation functions with the returned
633 /// data is not unsafe, but may panic if the data is invalid.
load_file(dic_path: &str) -> Option<Mmap>634 pub unsafe fn load_file(dic_path: &str) -> Option<Mmap> {
635     let file = File::open(dic_path).ok()?;
636     let dic = Mmap::map(&file).ok()?;
637     let hyph = Hyphenator(&*dic);
638     if hyph.is_valid_hyphenator() {
639         return Some(dic);
640     }
641     None
642 }
643