1 // Copyright 2015 The Servo Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! 3.3.3 Preparations for Implicit Processing
11 //!
12 //! <http://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
13 
14 use std::cmp::max;
15 use std::ops::Range;
16 
17 use super::char_data::BidiClass;
18 use super::level::Level;
19 
20 use BidiClass::*;
21 
22 /// A maximal substring of characters with the same embedding level.
23 ///
24 /// Represented as a range of byte indices.
25 pub type LevelRun = Range<usize>;
26 
27 
28 /// Output of `isolating_run_sequences` (steps X9-X10)
29 #[derive(Debug, PartialEq)]
30 pub struct IsolatingRunSequence {
31     pub runs: Vec<LevelRun>,
32     pub sos: BidiClass, // Start-of-sequence type.
33     pub eos: BidiClass, // End-of-sequence type.
34 }
35 
36 
37 /// Compute the set of isolating run sequences.
38 ///
39 /// An isolating run sequence is a maximal sequence of level runs such that for all level runs
40 /// except the last one in the sequence, the last character of the run is an isolate initiator
41 /// whose matching PDI is the first character of the next level run in the sequence.
42 ///
43 /// Note: This function does *not* return the sequences in order by their first characters.
44 #[cfg_attr(feature = "flame_it", flame)]
isolating_run_sequences( para_level: Level, original_classes: &[BidiClass], levels: &[Level], ) -> Vec<IsolatingRunSequence>45 pub fn isolating_run_sequences(
46     para_level: Level,
47     original_classes: &[BidiClass],
48     levels: &[Level],
49 ) -> Vec<IsolatingRunSequence> {
50     let runs = level_runs(levels, original_classes);
51 
52     // Compute the set of isolating run sequences.
53     // <http://www.unicode.org/reports/tr9/#BD13>
54     let mut sequences = Vec::with_capacity(runs.len());
55 
56     // When we encounter an isolate initiator, we push the current sequence onto the
57     // stack so we can resume it after the matching PDI.
58     let mut stack = vec![Vec::new()];
59 
60     for run in runs {
61         assert!(run.len() > 0);
62         assert!(!stack.is_empty());
63 
64         let start_class = original_classes[run.start];
65         let end_class = original_classes[run.end - 1];
66 
67         let mut sequence = if start_class == PDI && stack.len() > 1 {
68             // Continue a previous sequence interrupted by an isolate.
69             stack.pop().unwrap()
70         } else {
71             // Start a new sequence.
72             Vec::new()
73         };
74 
75         sequence.push(run);
76 
77         if matches!(end_class, RLI | LRI | FSI) {
78             // Resume this sequence after the isolate.
79             stack.push(sequence);
80         } else {
81             // This sequence is finished.
82             sequences.push(sequence);
83         }
84     }
85     // Pop any remaning sequences off the stack.
86     sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
87 
88     // Determine the `sos` and `eos` class for each sequence.
89     // <http://www.unicode.org/reports/tr9/#X10>
90     sequences
91         .into_iter()
92         .map(|sequence: Vec<LevelRun>| {
93             assert!(!sequence.is_empty());
94 
95             let start_of_seq = sequence[0].start;
96             let end_of_seq = sequence[sequence.len() - 1].end;
97             let seq_level = levels[start_of_seq];
98 
99             #[cfg(test)]
100             for run in sequence.clone() {
101                 for idx in run {
102                     if not_removed_by_x9(&original_classes[idx]) {
103                         assert_eq!(seq_level, levels[idx]);
104                     }
105                 }
106             }
107 
108             // Get the level of the last non-removed char before the runs.
109             let pred_level = match original_classes[..start_of_seq].iter().rposition(
110                 not_removed_by_x9,
111             ) {
112                 Some(idx) => levels[idx],
113                 None => para_level,
114             };
115 
116             // Get the level of the next non-removed char after the runs.
117             let succ_level = if matches!(original_classes[end_of_seq - 1], RLI | LRI | FSI) {
118                 para_level
119             } else {
120                 match original_classes[end_of_seq..].iter().position(
121                     not_removed_by_x9,
122                 ) {
123                     Some(idx) => levels[end_of_seq + idx],
124                     None => para_level,
125                 }
126             };
127 
128             IsolatingRunSequence {
129                 runs: sequence,
130                 sos: max(seq_level, pred_level).bidi_class(),
131                 eos: max(seq_level, succ_level).bidi_class(),
132             }
133         })
134         .collect()
135 }
136 
137 /// Finds the level runs in a paragraph.
138 ///
139 /// <http://www.unicode.org/reports/tr9/#BD7>
level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun>140 fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
141     assert_eq!(levels.len(), original_classes.len());
142 
143     let mut runs = Vec::new();
144     if levels.is_empty() {
145         return runs;
146     }
147 
148     let mut current_run_level = levels[0];
149     let mut current_run_start = 0;
150     for i in 1..levels.len() {
151         if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
152             // End the last run and start a new one.
153             runs.push(current_run_start..i);
154             current_run_level = levels[i];
155             current_run_start = i;
156         }
157     }
158     runs.push(current_run_start..levels.len());
159 
160     runs
161 }
162 
163 /// Should this character be ignored in steps after X9?
164 ///
165 /// <http://www.unicode.org/reports/tr9/#X9>
removed_by_x9(class: BidiClass) -> bool166 pub fn removed_by_x9(class: BidiClass) -> bool {
167     matches!(class, RLE | LRE | RLO | LRO | PDF | BN)
168 }
169 
170 // For use as a predicate for `position` / `rposition`
not_removed_by_x9(class: &BidiClass) -> bool171 pub fn not_removed_by_x9(class: &BidiClass) -> bool {
172     !removed_by_x9(*class)
173 }
174 
175 #[cfg(test)]
176 mod tests {
177     use super::*;
178 
179     #[test]
test_level_runs()180     fn test_level_runs() {
181         assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
182         assert_eq!(
183             level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
184             &[0..3, 3..5, 5..6, 6..8]
185         );
186     }
187 
188     // From <http://www.unicode.org/reports/tr9/#BD13>
189     #[cfg_attr(rustfmt, rustfmt_skip)]
190     #[test]
test_isolating_run_sequences()191     fn test_isolating_run_sequences() {
192 
193         // == Example 1 ==
194         // text1·RLE·text2·PDF·RLE·text3·PDF·text4
195         // index        0    1  2    3    4  5    6  7
196         let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
197         let levels =  &[0,   1, 1,   1,   1, 1,   1, 0];
198         let para_level = Level::ltr();
199         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
200         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
201         assert_eq!(
202             sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
203             vec![vec![0..2], vec![2..7], vec![7..8]]
204         );
205 
206         // == Example 2 ==
207         // text1·RLI·text2·PDI·RLI·text3·PDI·text4
208         // index        0    1  2    3    4  5    6  7
209         let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
210         let levels =  &[0,   0, 1,   0,   0, 1,   0, 0];
211         let para_level = Level::ltr();
212         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
213         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
214         assert_eq!(
215             sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
216             vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
217         );
218 
219         // == Example 3 ==
220         // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
221         // index        0    1  2    3  4    5  6    7  8    9  10  11  12
222         let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI,  L];
223         let levels =  &[0,   0, 1,   1, 2,   3, 3,   3, 2,   1, 1,   0,  0];
224         let para_level = Level::ltr();
225         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
226         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
227         assert_eq!(
228             sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
229             vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
230         );
231     }
232 
233     // From <http://www.unicode.org/reports/tr9/#X10>
234     #[cfg_attr(rustfmt, rustfmt_skip)]
235     #[test]
test_isolating_run_sequences_sos_and_eos()236     fn test_isolating_run_sequences_sos_and_eos() {
237 
238         // == Example 1 ==
239         // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
240         // index        0    1  2    3  4    5  6    7    8  9   10  11
241         let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF,  L];
242         let levels =  &[0,   1, 1,   2, 2,   2, 1,   1,   1, 1,   1,  0];
243         let para_level = Level::ltr();
244         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
245         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
246 
247         // text1
248         assert_eq!(
249             &sequences[0],
250             &IsolatingRunSequence {
251                 runs: vec![0..2],
252                 sos: L,
253                 eos: R,
254             }
255         );
256 
257         // text2
258         assert_eq!(
259             &sequences[1],
260             &IsolatingRunSequence {
261                 runs: vec![2..4],
262                 sos: R,
263                 eos: L,
264             }
265         );
266 
267         // text3
268         assert_eq!(
269             &sequences[2],
270             &IsolatingRunSequence {
271                 runs: vec![4..6],
272                 sos: L,
273                 eos: L,
274             }
275         );
276 
277         // text4 text5
278         assert_eq!(
279             &sequences[3],
280             &IsolatingRunSequence {
281                 runs: vec![6..11],
282                 sos: L,
283                 eos: R,
284             }
285         );
286 
287         // text6
288         assert_eq!(
289             &sequences[4],
290             &IsolatingRunSequence {
291                 runs: vec![11..12],
292                 sos: R,
293                 eos: L,
294             }
295         );
296 
297         // == Example 2 ==
298         // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
299         // index        0    1  2    3  4    5  6    7    8  9   10  11
300         let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI,  L];
301         let levels =  &[0,   0, 1,   1, 2,   1, 1,   0,   0, 1,   0,  0];
302         let para_level = Level::ltr();
303         let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
304         sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
305 
306         // text1·RLI·PDI·RLI·PDI·text6
307         assert_eq!(
308             &sequences[0],
309             &IsolatingRunSequence {
310                 runs: vec![0..2, 7..9, 10..12],
311                 sos: L,
312                 eos: L,
313             }
314         );
315 
316         // text2·LRI·PDI·text4
317         assert_eq!(
318             &sequences[1],
319             &IsolatingRunSequence {
320                 runs: vec![2..4, 5..7],
321                 sos: R,
322                 eos: R,
323             }
324         );
325 
326         // text3
327         assert_eq!(
328             &sequences[2],
329             &IsolatingRunSequence {
330                 runs: vec![4..5],
331                 sos: L,
332                 eos: L,
333             }
334         );
335 
336         // text5
337         assert_eq!(
338             &sequences[3],
339             &IsolatingRunSequence {
340                 runs: vec![9..10],
341                 sos: R,
342                 eos: R,
343             }
344         );
345     }
346 
347     #[test]
test_removed_by_x9()348     fn test_removed_by_x9() {
349         let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
350         let not_classes = &[L, RLI, AL, LRI, PDI];
351         for x in rem_classes {
352             assert_eq!(removed_by_x9(*x), true);
353         }
354         for x in not_classes {
355             assert_eq!(removed_by_x9(*x), false);
356         }
357     }
358 
359     #[test]
test_not_removed_by_x9()360     fn test_not_removed_by_x9() {
361         let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI];
362         for x in non_x9_classes {
363             assert_eq!(not_removed_by_x9(&x), true);
364         }
365     }
366 }
367