1// Copyright 2015 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package cases 6 7func (c info) cccVal() info { 8 if c&exceptionBit != 0 { 9 return info(exceptions[c>>exceptionShift]) & cccMask 10 } 11 return c & cccMask 12} 13 14func (c info) cccType() info { 15 ccc := c.cccVal() 16 if ccc <= cccZero { 17 return cccZero 18 } 19 return ccc 20} 21 22// TODO: Implement full Unicode breaking algorithm: 23// 1) Implement breaking in separate package. 24// 2) Use the breaker here. 25// 3) Compare table size and performance of using the more generic breaker. 26// 27// Note that we can extend the current algorithm to be much more accurate. This 28// only makes sense, though, if the performance and/or space penalty of using 29// the generic breaker is big. Extra data will only be needed for non-cased 30// runes, which means there are sufficient bits left in the caseType. 31// ICU prohibits breaking in such cases as well. 32 33// For the purpose of title casing we use an approximation of the Unicode Word 34// Breaking algorithm defined in Annex #29: 35// http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table. 36// 37// For our approximation, we group the Word Break types into the following 38// categories, with associated rules: 39// 40// 1) Letter: 41// ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend, Format_FE, ZWJ. 42// Rule: Never break between consecutive runes of this category. 43// 44// 2) Mid: 45// MidLetter, MidNumLet, Single_Quote. 46// (Cf. case-ignorable: MidLetter, MidNumLet, Single_Quote or cat is Mn, 47// Me, Cf, Lm or Sk). 48// Rule: Don't break between Letter and Mid, but break between two Mids. 49// 50// 3) Break: 51// Any other category: NewLine, MidNum, CR, LF, Double_Quote, Katakana, and 52// Other. 53// These categories should always result in a break between two cased letters. 54// Rule: Always break. 55// 56// Note 1: the Katakana and MidNum categories can, in esoteric cases, result in 57// preventing a break between two cased letters. For now we will ignore this 58// (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and 59// [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].) 60// 61// Note 2: the rule for Mid is very approximate, but works in most cases. To 62// improve, we could store the categories in the trie value and use a FA to 63// manage breaks. See TODO comment above. 64// 65// Note 3: according to the spec, it is possible for the Extend category to 66// introduce breaks between other categories grouped in Letter. However, this 67// is undesirable for our purposes. ICU prevents breaks in such cases as well. 68 69// isBreak returns whether this rune should introduce a break. 70func (c info) isBreak() bool { 71 return c.cccVal() == cccBreak 72} 73 74// isLetter returns whether the rune is of break type ALetter, Hebrew_Letter, 75// Numeric, ExtendNumLet, or Extend. 76func (c info) isLetter() bool { 77 ccc := c.cccVal() 78 if ccc == cccZero { 79 return !c.isCaseIgnorable() 80 } 81 return ccc != cccBreak 82} 83