// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
||
|
|
||
|
package norm
|
||
|
|
||
|
import "encoding/binary"
|
||
|
|
||
|
// This file contains Form-specific logic and wrappers for data in tables.go.

// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
//
//	bits
//	15:    1 (inverse of NFD_QC bit of qcInfo)
//	13..7: qcInfo (see below). isYesD is always true (no decomposition).
//	6..0:  ccc (compressed CCC value).
//
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
//
//	<header> <decomp_byte>* [<tccc> [<lccc>]]
//
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.
|
||
|
|
||
|
const (
	// qcInfoMask clears all but the relevant (low six) bits in a qcInfo.
	qcInfoMask = 0x3F
	// headerLenMask extracts the length value from the header byte.
	headerLenMask = 0x3F
	// headerFlagsMask extracts the qcInfo bits (the two most significant
	// bits) from the header byte.
	headerFlagsMask = 0xC0
)
|
||
|
|
||
|
// Properties provides access to normalization properties of a rune.
|
||
|
type Properties struct {
|
||
|
pos uint8 // start position in reorderBuffer; used in composition.go
|
||
|
size uint8 // length of UTF-8 encoding of this rune
|
||
|
ccc uint8 // leading canonical combining class (ccc if not decomposition)
|
||
|
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
|
||
|
nLead uint8 // number of leading non-starters.
|
||
|
flags qcInfo // quick check flags
|
||
|
index uint16
|
||
|
}
|
||
|
|
||
|
// functions dispatchable per form
|
||
|
type lookupFunc func(b input, i int) Properties
|
||
|
|
||
|
// formInfo holds Form-specific functions and tables.
|
||
|
type formInfo struct {
|
||
|
form Form
|
||
|
composing, compatibility bool // form type
|
||
|
info lookupFunc
|
||
|
nextMain iterFunc
|
||
|
}
|
||
|
|
||
|
var formTable = []*formInfo{{
|
||
|
form: NFC,
|
||
|
composing: true,
|
||
|
compatibility: false,
|
||
|
info: lookupInfoNFC,
|
||
|
nextMain: nextComposed,
|
||
|
}, {
|
||
|
form: NFD,
|
||
|
composing: false,
|
||
|
compatibility: false,
|
||
|
info: lookupInfoNFC,
|
||
|
nextMain: nextDecomposed,
|
||
|
}, {
|
||
|
form: NFKC,
|
||
|
composing: true,
|
||
|
compatibility: true,
|
||
|
info: lookupInfoNFKC,
|
||
|
nextMain: nextComposed,
|
||
|
}, {
|
||
|
form: NFKD,
|
||
|
composing: false,
|
||
|
compatibility: true,
|
||
|
info: lookupInfoNFKC,
|
||
|
nextMain: nextDecomposed,
|
||
|
}}
|
||
|
|
||
|
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user. For example, in NFD, there is a boundary
// after 'a'. However, 'a' might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.
|
||
|
|
||
|
// BoundaryBefore returns true if this rune starts a new segment and
|
||
|
// cannot combine with any rune on the left.
|
||
|
func (p Properties) BoundaryBefore() bool {
|
||
|
if p.ccc == 0 && !p.combinesBackward() {
|
||
|
return true
|
||
|
}
|
||
|
// We assume that the CCC of the first character in a decomposition
|
||
|
// is always non-zero if different from info.ccc and that we can return
|
||
|
// false at this point. This is verified by maketables.
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
// BoundaryAfter returns true if runes cannot combine with or otherwise
|
||
|
// interact with this or previous runes.
|
||
|
func (p Properties) BoundaryAfter() bool {
|
||
|
// TODO: loosen these conditions.
|
||
|
return p.isInert()
|
||
|
}
|
||
|
|
||
|
// We pack quick check data in 6 bits:
//
//	5:    Combines forward  (0 == false, 1 == true)
//	4..3: NFC_QC Yes(00), No (10), or Maybe (11)
//	2:    NFD_QC Yes (0) or No (1). No also means there is a decomposition.
//	1..0: Number of trailing non-starters.
//
// When all 6 bits are zero, the character is inert, meaning it is never
// influenced by normalization.
type qcInfo uint8
|
||
|
|
||
|
func (p Properties) isYesC() bool { return p.flags&0x10 == 0 }
|
||
|
func (p Properties) isYesD() bool { return p.flags&0x4 == 0 }
|
||
|
|
||
|
func (p Properties) combinesForward() bool { return p.flags&0x20 != 0 }
|
||
|
func (p Properties) combinesBackward() bool { return p.flags&0x8 != 0 } // == isMaybe
|
||
|
func (p Properties) hasDecomposition() bool { return p.flags&0x4 != 0 } // == isNoD
|
||
|
|
||
|
func (p Properties) isInert() bool {
|
||
|
return p.flags&qcInfoMask == 0 && p.ccc == 0
|
||
|
}
|
||
|
|
||
|
func (p Properties) multiSegment() bool {
|
||
|
return p.index >= firstMulti && p.index < endMulti
|
||
|
}
|
||
|
|
||
|
func (p Properties) nLeadingNonStarters() uint8 {
|
||
|
return p.nLead
|
||
|
}
|
||
|
|
||
|
func (p Properties) nTrailingNonStarters() uint8 {
|
||
|
return uint8(p.flags & 0x03)
|
||
|
}
|
||
|
|
||
|
// Decomposition returns the decomposition for the underlying rune
|
||
|
// or nil if there is none.
|
||
|
func (p Properties) Decomposition() []byte {
|
||
|
// TODO: create the decomposition for Hangul?
|
||
|
if p.index == 0 {
|
||
|
return nil
|
||
|
}
|
||
|
i := p.index
|
||
|
n := decomps[i] & headerLenMask
|
||
|
i++
|
||
|
return decomps[i : i+uint16(n)]
|
||
|
}
|
||
|
|
||
|
// Size returns the length of UTF-8 encoding of the rune.
|
||
|
func (p Properties) Size() int {
|
||
|
return int(p.size)
|
||
|
}
|
||
|
|
||
|
// CCC returns the canonical combining class of the underlying rune.
|
||
|
func (p Properties) CCC() uint8 {
|
||
|
if p.index >= firstCCCZeroExcept {
|
||
|
return 0
|
||
|
}
|
||
|
return ccc[p.ccc]
|
||
|
}
|
||
|
|
||
|
// LeadCCC returns the CCC of the first rune in the decomposition.
|
||
|
// If there is no decomposition, LeadCCC equals CCC.
|
||
|
func (p Properties) LeadCCC() uint8 {
|
||
|
return ccc[p.ccc]
|
||
|
}
|
||
|
|
||
|
// TrailCCC returns the CCC of the last rune in the decomposition.
|
||
|
// If there is no decomposition, TrailCCC equals CCC.
|
||
|
func (p Properties) TrailCCC() uint8 {
|
||
|
return ccc[p.tccc]
|
||
|
}
|
||
|
|
||
|
func buildRecompMap() {
|
||
|
recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
|
||
|
var buf [8]byte
|
||
|
for i := 0; i < len(recompMapPacked); i += 8 {
|
||
|
copy(buf[:], recompMapPacked[i:i+8])
|
||
|
key := binary.BigEndian.Uint32(buf[:4])
|
||
|
val := binary.BigEndian.Uint32(buf[4:])
|
||
|
recompMap[key] = rune(val)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition maps for NFC and NFKC are identical.
|
||
|
|
||
|
// combine returns the combined rune or 0 if it doesn't exist.
|
||
|
//
|
||
|
// The caller is responsible for calling
|
||
|
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
|
||
|
func combine(a, b rune) rune {
|
||
|
key := uint32(uint16(a))<<16 + uint32(uint16(b))
|
||
|
if recompMap == nil {
|
||
|
panic("caller error") // see func comment
|
||
|
}
|
||
|
return recompMap[key]
|
||
|
}
|
||
|
|
||
|
func lookupInfoNFC(b input, i int) Properties {
|
||
|
v, sz := b.charinfoNFC(i)
|
||
|
return compInfo(v, sz)
|
||
|
}
|
||
|
|
||
|
func lookupInfoNFKC(b input, i int) Properties {
|
||
|
v, sz := b.charinfoNFKC(i)
|
||
|
return compInfo(v, sz)
|
||
|
}
|
||
|
|
||
|
// Properties returns properties for the first rune in s.
|
||
|
func (f Form) Properties(s []byte) Properties {
|
||
|
if f == NFC || f == NFD {
|
||
|
return compInfo(nfcData.lookup(s))
|
||
|
}
|
||
|
return compInfo(nfkcData.lookup(s))
|
||
|
}
|
||
|
|
||
|
// PropertiesString returns properties for the first rune in s.
|
||
|
func (f Form) PropertiesString(s string) Properties {
|
||
|
if f == NFC || f == NFD {
|
||
|
return compInfo(nfcData.lookupString(s))
|
||
|
}
|
||
|
return compInfo(nfkcData.lookupString(s))
|
||
|
}
|
||
|
|
||
|
// compInfo converts the information contained in v and sz
|
||
|
// to a Properties. See the comment at the top of the file
|
||
|
// for more information on the format.
|
||
|
func compInfo(v uint16, sz int) Properties {
|
||
|
if v == 0 {
|
||
|
return Properties{size: uint8(sz)}
|
||
|
} else if v >= 0x8000 {
|
||
|
p := Properties{
|
||
|
size: uint8(sz),
|
||
|
ccc: uint8(v),
|
||
|
tccc: uint8(v),
|
||
|
flags: qcInfo(v >> 8),
|
||
|
}
|
||
|
if p.ccc > 0 || p.combinesBackward() {
|
||
|
p.nLead = uint8(p.flags & 0x3)
|
||
|
}
|
||
|
return p
|
||
|
}
|
||
|
// has decomposition
|
||
|
h := decomps[v]
|
||
|
f := (qcInfo(h&headerFlagsMask) >> 2) | 0x4
|
||
|
p := Properties{size: uint8(sz), flags: f, index: v}
|
||
|
if v >= firstCCC {
|
||
|
v += uint16(h&headerLenMask) + 1
|
||
|
c := decomps[v]
|
||
|
p.tccc = c >> 2
|
||
|
p.flags |= qcInfo(c & 0x3)
|
||
|
if v >= firstLeadingCCC {
|
||
|
p.nLead = c & 0x3
|
||
|
if v >= firstStarterWithNLead {
|
||
|
// We were tricked. Remove the decomposition.
|
||
|
p.flags &= 0x03
|
||
|
p.index = 0
|
||
|
return p
|
||
|
}
|
||
|
p.ccc = decomps[v+1]
|
||
|
}
|
||
|
}
|
||
|
return p
|
||
|
}
|