aboutsummaryrefslogtreecommitdiff
path: root/vendor/golang.org/x/text/unicode/norm/forminfo.go
blob: 487335d14d360884a2b684e6d4055c7baea001ac (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import "encoding/binary"

// This file contains Form-specific logic and wrappers for data in tables.go.

// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie.  Each trie maps
// a rune to a uint16. The values take two forms.  For v >= 0x8000:
//   bits
//   15:    1 (inverse of NFD_QC bit of qcInfo)
//   13..8: qcInfo (see below). isYesD is always true (no decomposition).
//    7..0: ccc (compressed CCC value).
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
//    <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below).  The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero.  The value of v determines which ccc are appended
// to the sequences.  For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.

// Bit masks used when decoding qcInfo values and the header byte that
// precedes each decomposition sequence in decomps. The header byte packs a
// 6-bit length (headerLenMask) and 2 flag bits (headerFlagsMask).
const (
	qcInfoMask      = 0x3F // to clear all but the relevant bits in a qcInfo
	headerLenMask   = 0x3F // extract the length value from the header byte
	headerFlagsMask = 0xC0 // extract the qcInfo bits from the header byte
)

// Properties provides access to normalization properties of a rune.
// Values are built by compInfo from a trie entry and the size of the rune's
// encoding. The ccc and tccc fields hold compressed values that are expanded
// through the ccc table by the CCC, LeadCCC and TrailCCC accessors.
type Properties struct {
	pos   uint8  // start position in reorderBuffer; used in composition.go
	size  uint8  // length of UTF-8 encoding of this rune
	ccc   uint8  // leading canonical combining class (ccc if not decomposition)
	tccc  uint8  // trailing canonical combining class (ccc if not decomposition)
	nLead uint8  // number of leading non-starters.
	flags qcInfo // quick check flags
	index uint16 // position of the decomposition header in decomps; 0 if there is no decomposition
}

// lookupFunc is the signature of the per-form property lookup functions
// (see lookupInfoNFC and lookupInfoNFKC). It returns the Properties found
// for index i of input b.
type lookupFunc func(b input, i int) Properties

// formInfo holds Form-specific functions and tables.
//
// form identifies the normalization form; composing and compatibility
// classify it (NFC: composing, NFKD: compatibility but not composing, etc.).
// info is the property lookup function used for the form and nextMain is the
// form's iteration function (iterFunc is declared elsewhere in the package).
type formInfo struct {
	form                     Form
	composing, compatibility bool // form type
	info                     lookupFunc
	nextMain                 iterFunc
}

// formTable holds one formInfo per normalization form, listed in the order
// NFC, NFD, NFKC, NFKD.
// NOTE(review): presumably indexed by Form value, so the order of the
// entries must match the Form constants; confirm before reordering.
//
// A decomposing form reuses the lookup function of its composing
// counterpart: NFD uses lookupInfoNFC and NFKD uses lookupInfoNFKC.
var formTable = []*formInfo{{
	form:          NFC,
	composing:     true,
	compatibility: false,
	info:          lookupInfoNFC,
	nextMain:      nextComposed,
}, {
	form:          NFD,
	composing:     false,
	compatibility: false,
	info:          lookupInfoNFC, // shares the NFC table
	nextMain:      nextDecomposed,
}, {
	form:          NFKC,
	composing:     true,
	compatibility: true,
	info:          lookupInfoNFKC,
	nextMain:      nextComposed,
}, {
	form:          NFKD,
	composing:     false,
	compatibility: true,
	info:          lookupInfoNFKC, // shares the NFKC table
	nextMain:      nextDecomposed,
}}

// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user.  For example, in NFD, there is a boundary
// after 'a'.  However, 'a' might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.

// BoundaryBefore reports whether this rune starts a new segment, i.e. it
// cannot combine with any rune on its left.
func (p Properties) BoundaryBefore() bool {
	// Only a rune with a zero leading ccc that does not combine backward
	// starts a segment. We assume that the CCC of the first character in a
	// decomposition is always non-zero if different from info.ccc, so every
	// other case can be answered with false. This is verified by maketables.
	return p.ccc == 0 && !p.combinesBackward()
}

// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
func (p Properties) BoundaryAfter() bool {
	// TODO: loosen these conditions.
	// For now only inert runes (no quick check bits set and a zero ccc, see
	// isInert) are reported as having a boundary after them.
	return p.isInert()
}

// We pack quick check data in 6 bits:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all 6 bits are zero (and the ccc is zero as well), the character is
// inert, meaning it is never influenced by normalization.
type qcInfo uint8

// isYesC reports whether NFC_QC is Yes, which is the case when the upper
// NFC_QC bit (bit 4 of the quick check flags) is clear.
func (p Properties) isYesC() bool {
	const nfcNoOrMaybe qcInfo = 0x10
	return p.flags&nfcNoOrMaybe == 0
}
// isYesD reports whether NFD_QC is Yes, which is the case when bit 2 of the
// quick check flags is clear.
func (p Properties) isYesD() bool {
	const nfdNo qcInfo = 0x4
	return p.flags&nfdNo == 0
}

// combinesForward reports whether the "combines forward" bit (bit 5 of the
// quick check flags) is set.
func (p Properties) combinesForward() bool {
	const forwardBit qcInfo = 0x20
	return p.flags&forwardBit != 0
}
// combinesBackward reports whether bit 3 of the quick check flags is set.
// Given the NFC_QC encoding (Yes 00, No 10, Maybe 11) this is equivalent to
// NFC_QC being Maybe.
func (p Properties) combinesBackward() bool {
	const maybeBit qcInfo = 0x8
	return p.flags&maybeBit != 0
}
// hasDecomposition reports whether the NFD_QC bit (bit 2 of the quick check
// flags) is set, i.e. NFD_QC is No. It is the negation of isYesD.
func (p Properties) hasDecomposition() bool {
	const nfdNo qcInfo = 0x4
	return p.flags&nfdNo != 0
}

// isInert reports whether the rune has a zero ccc and none of the bits
// selected by qcInfoMask set in its quick check flags.
func (p Properties) isInert() bool {
	if p.ccc != 0 {
		return false
	}
	return p.flags&qcInfoMask == 0
}

// multiSegment reports whether p.index lies in the half-open range
// [firstMulti, endMulti).
// NOTE(review): presumably this range of decomps holds decompositions that
// span more than one segment; confirm against the generated tables.
func (p Properties) multiSegment() bool {
	if p.index < firstMulti {
		return false
	}
	return p.index < endMulti
}

// nLeadingNonStarters returns the number of leading non-starters recorded
// for the rune (the nLead field, filled in by compInfo).
func (p Properties) nLeadingNonStarters() uint8 {
	return p.nLead
}

// nTrailingNonStarters returns the number of trailing non-starters, which is
// stored in the two least-significant bits of the quick check flags.
func (p Properties) nTrailingNonStarters() uint8 {
	const trailingMask qcInfo = 0x03
	return uint8(p.flags & trailingMask)
}

// Decomposition returns the decomposition for the underlying rune
// or nil if there is none.
//
// The result is a slice of the package-level decomps table, not a copy.
func (p Properties) Decomposition() []byte {
	// TODO: create the decomposition for Hangul?
	if p.index == 0 {
		return nil
	}
	// The header byte at p.index carries the length in its low bits; the
	// decomposition bytes follow immediately after it.
	start := p.index + 1
	length := uint16(decomps[p.index] & headerLenMask)
	return decomps[start : start+length]
}

// Size returns the length of UTF-8 encoding of the rune.
// The value is the size recorded when the Properties was created,
// widened from uint8 to int.
func (p Properties) Size() int {
	return int(p.size)
}

// CCC returns the canonical combining class of the underlying rune.
//
// Runes whose index is at or beyond firstCCCZeroExcept report 0; for all
// others the compressed ccc value is expanded through the ccc table.
func (p Properties) CCC() uint8 {
	if p.index < firstCCCZeroExcept {
		return ccc[p.ccc]
	}
	return 0
}

// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
func (p Properties) LeadCCC() uint8 {
	// p.ccc is a compressed CCC value used as an index into the ccc table.
	return ccc[p.ccc]
}

// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
func (p Properties) TrailCCC() uint8 {
	// p.tccc is a compressed CCC value used as an index into the ccc table.
	return ccc[p.tccc]
}

// buildRecompMap populates recompMap from recompMapPacked, which is read as
// a sequence of 8-byte records: a big-endian uint32 key followed by a
// big-endian uint32 holding the composed rune.
func buildRecompMap() {
	const recordSize = 8
	recompMap = make(map[uint32]rune, len(recompMapPacked)/recordSize)
	var record [recordSize]byte
	for off := 0; off < len(recompMapPacked); off += recordSize {
		copy(record[:], recompMapPacked[off:off+recordSize])
		composed := rune(binary.BigEndian.Uint32(record[4:]))
		recompMap[binary.BigEndian.Uint32(record[:4])] = composed
	}
}

// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.

// combine returns the rune that a followed by b composes to, or 0 if no
// such composition exists.
//
// recompMap must already be populated: the caller is responsible for calling
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
func combine(a, b rune) rune {
	if recompMap == nil {
		panic("caller error") // see func comment
	}
	// The key keeps only the low 16 bits of each rune: a in the upper half,
	// b in the lower half.
	hi, lo := uint32(uint16(a)), uint32(uint16(b))
	return recompMap[hi<<16+lo]
}

// lookupInfoNFC is the lookupFunc for NFC and NFD: it fetches the NFC trie
// value and size for index i of b and converts them with compInfo.
func lookupInfoNFC(b input, i int) Properties {
	return compInfo(b.charinfoNFC(i))
}

// lookupInfoNFKC is the lookupFunc for NFKC and NFKD: it fetches the NFKC
// trie value and size for index i of b and converts them with compInfo.
func lookupInfoNFKC(b input, i int) Properties {
	return compInfo(b.charinfoNFKC(i))
}

// Properties returns properties for the first rune in s.
// NFC and NFD are served from nfcData; every other form uses nfkcData.
func (f Form) Properties(s []byte) Properties {
	switch f {
	case NFC, NFD:
		return compInfo(nfcData.lookup(s))
	default:
		return compInfo(nfkcData.lookup(s))
	}
}

// PropertiesString returns properties for the first rune in s.
// NFC and NFD are served from nfcData; every other form uses nfkcData.
func (f Form) PropertiesString(s string) Properties {
	switch f {
	case NFC, NFD:
		return compInfo(nfcData.lookupString(s))
	default:
		return compInfo(nfkcData.lookupString(s))
	}
}

// compInfo converts the information contained in v and sz
// to a Properties.  See the comment at the top of the file
// for more information on the format.
//
// v is the trie value for the rune and sz the size of its encoding. Three
// cases are distinguished: v == 0 (only the size is recorded), v >= 0x8000
// (no decomposition; ccc and flags are packed in v itself) and everything
// else (v is the position of a decomposition header in decomps).
func compInfo(v uint16, sz int) Properties {
	size := uint8(sz)
	switch {
	case v == 0:
		return Properties{size: size}
	case v >= 0x8000:
		// Low byte is the ccc, high byte carries the quick check flags.
		direct := Properties{
			size:  size,
			ccc:   uint8(v),
			tccc:  uint8(v),
			flags: qcInfo(v >> 8),
		}
		if direct.ccc > 0 || direct.combinesBackward() {
			direct.nLead = uint8(direct.flags & 0x3)
		}
		return direct
	}

	// has decomposition
	header := decomps[v]
	info := Properties{
		size:  size,
		flags: qcInfo(header&headerFlagsMask)>>2 | 0x4,
		index: v,
	}
	if v < firstCCC {
		// No ccc bytes are appended to this decomposition.
		return info
	}

	// Advance v past the header and the decomposition bytes so that it
	// addresses the tccc byte. The comparisons below intentionally use
	// this advanced value of v.
	v += uint16(header&headerLenMask) + 1
	trail := decomps[v]
	info.tccc = trail >> 2
	info.flags |= qcInfo(trail & 0x3)
	if v < firstLeadingCCC {
		return info
	}

	info.nLead = trail & 0x3
	if v >= firstStarterWithNLead {
		// We were tricked. Remove the decomposition.
		info.flags &= 0x03
		info.index = 0
		return info
	}
	info.ccc = decomps[v+1]
	return info
}