1// Copyright 2015 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package bidi 6 7import ( 8 "container/list" 9 "fmt" 10 "sort" 11) 12 13// This file contains a port of the reference implementation of the 14// Bidi Parentheses Algorithm: 15// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java 16// 17// The implementation in this file covers definitions BD14-BD16 and rule N0 18// of UAX#9. 19// 20// Some preprocessing is done for each rune before data is passed to this 21// algorithm: 22// - opening and closing brackets are identified 23// - a bracket pair type, like '(' and ')' is assigned a unique identifier that 24// is identical for the opening and closing bracket. It is left to do these 25// mappings. 26// - The BPA algorithm requires that bracket characters that are canonical 27// equivalents of each other be able to be substituted for each other. 28// It is the responsibility of the caller to do this canonicalization. 29// 30// In implementing BD16, this implementation departs slightly from the "logical" 31// algorithm defined in UAX#9. In particular, the stack referenced there 32// supports operations that go beyond a "basic" stack. An equivalent 33// implementation based on a linked list is used here. 34 35// Bidi_Paired_Bracket_Type 36// BD14. An opening paired bracket is a character whose 37// Bidi_Paired_Bracket_Type property value is Open. 38// 39// BD15. A closing paired bracket is a character whose 40// Bidi_Paired_Bracket_Type property value is Close. 41type bracketType byte 42 43const ( 44 bpNone bracketType = iota 45 bpOpen 46 bpClose 47) 48 49// bracketPair holds a pair of index values for opening and closing bracket 50// location of a bracket pair. 51type bracketPair struct { 52 opener int 53 closer int 54} 55 56func (b *bracketPair) String() string { 57 return fmt.Sprintf("(%v, %v)", b.opener, b.closer) 58} 59 60// bracketPairs is a slice of bracketPairs with a sort.Interface implementation. 61type bracketPairs []bracketPair 62 63func (b bracketPairs) Len() int { return len(b) } 64func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } 65func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener } 66 67// resolvePairedBrackets runs the paired bracket part of the UBA algorithm. 68// 69// For each rune, it takes the indexes into the original string, the class the 70// bracket type (in pairTypes) and the bracket identifier (pairValues). It also 71// takes the direction type for the start-of-sentence and the embedding level. 72// 73// The identifiers for bracket types are the rune of the canonicalized opening 74// bracket for brackets (open or close) or 0 for runes that are not brackets. 75func resolvePairedBrackets(s *isolatingRunSequence) { 76 p := bracketPairer{ 77 sos: s.sos, 78 openers: list.New(), 79 codesIsolatedRun: s.types, 80 indexes: s.indexes, 81 } 82 dirEmbed := L 83 if s.level&1 != 0 { 84 dirEmbed = R 85 } 86 p.locateBrackets(s.p.pairTypes, s.p.pairValues) 87 p.resolveBrackets(dirEmbed, s.p.initialTypes) 88} 89 90type bracketPairer struct { 91 sos Class // direction corresponding to start of sequence 92 93 // The following is a restatement of BD 16 using non-algorithmic language. 94 // 95 // A bracket pair is a pair of characters consisting of an opening 96 // paired bracket and a closing paired bracket such that the 97 // Bidi_Paired_Bracket property value of the former equals the latter, 98 // subject to the following constraints. 99 // - both characters of a pair occur in the same isolating run sequence 100 // - the closing character of a pair follows the opening character 101 // - any bracket character can belong at most to one pair, the earliest possible one 102 // - any bracket character not part of a pair is treated like an ordinary character 103 // - pairs may nest properly, but their spans may not overlap otherwise 104 105 // Bracket characters with canonical decompositions are supposed to be 106 // treated as if they had been normalized, to allow normalized and non- 107 // normalized text to give the same result. In this implementation that step 108 // is pushed out to the caller. The caller has to ensure that the pairValue 109 // slices contain the rune of the opening bracket after normalization for 110 // any opening or closing bracket. 111 112 openers *list.List // list of positions for opening brackets 113 114 // bracket pair positions sorted by location of opening bracket 115 pairPositions bracketPairs 116 117 codesIsolatedRun []Class // directional bidi codes for an isolated run 118 indexes []int // array of index values into the original string 119 120} 121 122// matchOpener reports whether characters at given positions form a matching 123// bracket pair. 124func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool { 125 return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]] 126} 127 128const maxPairingDepth = 63 129 130// locateBrackets locates matching bracket pairs according to BD16. 131// 132// This implementation uses a linked list instead of a stack, because, while 133// elements are added at the front (like a push) they are not generally removed 134// in atomic 'pop' operations, reducing the benefit of the stack archetype. 135func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) { 136 // traverse the run 137 // do that explicitly (not in a for-each) so we can record position 138 for i, index := range p.indexes { 139 140 // look at the bracket type for each character 141 if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON { 142 // continue scanning 143 continue 144 } 145 switch pairTypes[index] { 146 case bpOpen: 147 // check if maximum pairing depth reached 148 if p.openers.Len() == maxPairingDepth { 149 p.openers.Init() 150 return 151 } 152 // remember opener location, most recent first 153 p.openers.PushFront(i) 154 155 case bpClose: 156 // see if there is a match 157 count := 0 158 for elem := p.openers.Front(); elem != nil; elem = elem.Next() { 159 count++ 160 opener := elem.Value.(int) 161 if p.matchOpener(pairValues, opener, i) { 162 // if the opener matches, add nested pair to the ordered list 163 p.pairPositions = append(p.pairPositions, bracketPair{opener, i}) 164 // remove up to and including matched opener 165 for ; count > 0; count-- { 166 p.openers.Remove(p.openers.Front()) 167 } 168 break 169 } 170 } 171 sort.Sort(p.pairPositions) 172 // if we get here, the closing bracket matched no openers 173 // and gets ignored 174 } 175 } 176} 177 178// Bracket pairs within an isolating run sequence are processed as units so 179// that both the opening and the closing paired bracket in a pair resolve to 180// the same direction. 181// 182// N0. Process bracket pairs in an isolating run sequence sequentially in 183// the logical order of the text positions of the opening paired brackets 184// using the logic given below. Within this scope, bidirectional types EN 185// and AN are treated as R. 186// 187// Identify the bracket pairs in the current isolating run sequence 188// according to BD16. For each bracket-pair element in the list of pairs of 189// text positions: 190// 191// a Inspect the bidirectional types of the characters enclosed within the 192// bracket pair. 193// 194// b If any strong type (either L or R) matching the embedding direction is 195// found, set the type for both brackets in the pair to match the embedding 196// direction. 197// 198// o [ e ] o -> o e e e o 199// 200// o [ o e ] -> o e o e e 201// 202// o [ NI e ] -> o e NI e e 203// 204// c Otherwise, if a strong type (opposite the embedding direction) is 205// found, test for adjacent strong types as follows: 1 First, check 206// backwards before the opening paired bracket until the first strong type 207// (L, R, or sos) is found. If that first preceding strong type is opposite 208// the embedding direction, then set the type for both brackets in the pair 209// to that type. 2 Otherwise, set the type for both brackets in the pair to 210// the embedding direction. 211// 212// o [ o ] e -> o o o o e 213// 214// o [ o NI ] o -> o o o NI o o 215// 216// e [ o ] o -> e e o e o 217// 218// e [ o ] e -> e e o e e 219// 220// e ( o [ o ] NI ) e -> e e o o o o NI e e 221// 222// d Otherwise, do not set the type for the current bracket pair. Note that 223// if the enclosed text contains no strong types the paired brackets will 224// both resolve to the same level when resolved individually using rules N1 225// and N2. 226// 227// e ( NI ) o -> e ( NI ) o 228 229// getStrongTypeN0 maps character's directional code to strong type as required 230// by rule N0. 231// 232// TODO: have separate type for "strong" directionality. 233func (p *bracketPairer) getStrongTypeN0(index int) Class { 234 switch p.codesIsolatedRun[index] { 235 // in the scope of N0, number types are treated as R 236 case EN, AN, AL, R: 237 return R 238 case L: 239 return L 240 default: 241 return ON 242 } 243} 244 245// classifyPairContent reports the strong types contained inside a Bracket Pair, 246// assuming the given embedding direction. 247// 248// It returns ON if no strong type is found. If a single strong type is found, 249// it returns this type. Otherwise it returns the embedding direction. 250// 251// TODO: use separate type for "strong" directionality. 252func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class { 253 dirOpposite := ON 254 for i := loc.opener + 1; i < loc.closer; i++ { 255 dir := p.getStrongTypeN0(i) 256 if dir == ON { 257 continue 258 } 259 if dir == dirEmbed { 260 return dir // type matching embedding direction found 261 } 262 dirOpposite = dir 263 } 264 // return ON if no strong type found, or class opposite to dirEmbed 265 return dirOpposite 266} 267 268// classBeforePair determines which strong types are present before a Bracket 269// Pair. Return R or L if strong type found, otherwise ON. 270func (p *bracketPairer) classBeforePair(loc bracketPair) Class { 271 for i := loc.opener - 1; i >= 0; i-- { 272 if dir := p.getStrongTypeN0(i); dir != ON { 273 return dir 274 } 275 } 276 // no strong types found, return sos 277 return p.sos 278} 279 280// assignBracketType implements rule N0 for a single bracket pair. 281func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) { 282 // rule "N0, a", inspect contents of pair 283 dirPair := p.classifyPairContent(loc, dirEmbed) 284 285 // dirPair is now L, R, or N (no strong type found) 286 287 // the following logical tests are performed out of order compared to 288 // the statement of the rules but yield the same results 289 if dirPair == ON { 290 return // case "d" - nothing to do 291 } 292 293 if dirPair != dirEmbed { 294 // case "c": strong type found, opposite - check before (c.1) 295 dirPair = p.classBeforePair(loc) 296 if dirPair == dirEmbed || dirPair == ON { 297 // no strong opposite type found before - use embedding (c.2) 298 dirPair = dirEmbed 299 } 300 } 301 // else: case "b", strong type found matching embedding, 302 // no explicit action needed, as dirPair is already set to embedding 303 // direction 304 305 // set the bracket types to the type found 306 p.setBracketsToType(loc, dirPair, initialTypes) 307} 308 309func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) { 310 p.codesIsolatedRun[loc.opener] = dirPair 311 p.codesIsolatedRun[loc.closer] = dirPair 312 313 for i := loc.opener + 1; i < loc.closer; i++ { 314 index := p.indexes[i] 315 if initialTypes[index] != NSM { 316 break 317 } 318 p.codesIsolatedRun[i] = dirPair 319 } 320 321 for i := loc.closer + 1; i < len(p.indexes); i++ { 322 index := p.indexes[i] 323 if initialTypes[index] != NSM { 324 break 325 } 326 p.codesIsolatedRun[i] = dirPair 327 } 328} 329 330// resolveBrackets implements rule N0 for a list of pairs. 331func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) { 332 for _, loc := range p.pairPositions { 333 p.assignBracketType(loc, dirEmbed, initialTypes) 334 } 335} 336