1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package cases
6
7func (c info) cccVal() info {
8	if c&exceptionBit != 0 {
9		return info(exceptions[c>>exceptionShift]) & cccMask
10	}
11	return c & cccMask
12}
13
14func (c info) cccType() info {
15	ccc := c.cccVal()
16	if ccc <= cccZero {
17		return cccZero
18	}
19	return ccc
20}
21
22// TODO: Implement full Unicode breaking algorithm:
23// 1) Implement breaking in separate package.
24// 2) Use the breaker here.
25// 3) Compare table size and performance of using the more generic breaker.
26//
27// Note that we can extend the current algorithm to be much more accurate. This
28// only makes sense, though, if the performance and/or space penalty of using
29// the generic breaker is big. Extra data will only be needed for non-cased
30// runes, which means there are sufficient bits left in the caseType.
31// ICU prohibits breaking in such cases as well.
32
33// For the purpose of title casing we use an approximation of the Unicode Word
34// Breaking algorithm defined in Annex #29:
// https://www.unicode.org/reports/tr29/#Word_Boundaries.
36//
37// For our approximation, we group the Word Break types into the following
38// categories, with associated rules:
39//
40// 1) Letter:
41//    ALetter, Hebrew_Letter, Numeric, ExtendNumLet, Extend, Format_FE, ZWJ.
42//    Rule: Never break between consecutive runes of this category.
43//
44// 2) Mid:
45//    MidLetter, MidNumLet, Single_Quote.
46//    (Cf. case-ignorable: MidLetter, MidNumLet, Single_Quote or cat is Mn,
47//    Me, Cf, Lm or Sk).
48//    Rule: Don't break between Letter and Mid, but break between two Mids.
49//
50// 3) Break:
51//    Any other category: NewLine, MidNum, CR, LF, Double_Quote, Katakana, and
52//    Other.
53//    These categories should always result in a break between two cased letters.
54//    Rule: Always break.
55//
56// Note 1: the Katakana and MidNum categories can, in esoteric cases, result in
57// preventing a break between two cased letters. For now we will ignore this
58// (e.g. [ALetter] [ExtendNumLet] [Katakana] [ExtendNumLet] [ALetter] and
59// [ALetter] [Numeric] [MidNum] [Numeric] [ALetter].)
60//
61// Note 2: the rule for Mid is very approximate, but works in most cases. To
62// improve, we could store the categories in the trie value and use a FA to
63// manage breaks. See TODO comment above.
64//
65// Note 3: according to the spec, it is possible for the Extend category to
66// introduce breaks between other categories grouped in Letter. However, this
67// is undesirable for our purposes. ICU prevents breaks in such cases as well.
68
69// isBreak returns whether this rune should introduce a break.
70func (c info) isBreak() bool {
71	return c.cccVal() == cccBreak
72}
73
74// isLetter returns whether the rune is of break type ALetter, Hebrew_Letter,
75// Numeric, ExtendNumLet, or Extend.
76func (c info) isLetter() bool {
77	ccc := c.cccVal()
78	if ccc == cccZero {
79		return !c.isCaseIgnorable()
80	}
81	return ccc != cccBreak
82}
83