aboutsummaryrefslogtreecommitdiff
path: root/vendor/golang.org/x/text/unicode/norm/normalize.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/golang.org/x/text/unicode/norm/normalize.go')
-rw-r--r--vendor/golang.org/x/text/unicode/norm/normalize.go610
1 files changed, 610 insertions, 0 deletions
diff --git a/vendor/golang.org/x/text/unicode/norm/normalize.go b/vendor/golang.org/x/text/unicode/norm/normalize.go
new file mode 100644
index 0000000..4747ad0
--- /dev/null
+++ b/vendor/golang.org/x/text/unicode/norm/normalize.go
@@ -0,0 +1,610 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Note: the file data_test.go that is generated should not be checked in.
+//go:generate go run maketables.go triegen.go
+//go:generate go test -tags test
+
+// Package norm contains types and functions for normalizing Unicode strings.
+package norm // import "golang.org/x/text/unicode/norm"
+
+import (
+ "unicode/utf8"
+
+ "golang.org/x/text/transform"
+)
+
+// A Form denotes a canonical representation of Unicode code points.
+// The Unicode-defined normalization and equivalence forms are:
+//
+// NFC Unicode Normalization Form C
+// NFD Unicode Normalization Form D
+// NFKC Unicode Normalization Form KC
+// NFKD Unicode Normalization Form KD
+//
+// For a Form f, this documentation uses the notation f(x) to mean
+// the bytes or string x converted to the given form.
+// A position n in x is called a boundary if conversion to the form can
+// proceed independently on both sides:
+//
+// f(x) == append(f(x[0:n]), f(x[n:])...)
+//
+// References: https://unicode.org/reports/tr15/ and
+// https://unicode.org/notes/tn5/.
+type Form int
+
+const (
+ NFC Form = iota
+ NFD
+ NFKC
+ NFKD
+)
+
+// Bytes returns f(b). May return b if f(b) = b.
+func (f Form) Bytes(b []byte) []byte {
+ src := inputBytes(b)
+ ft := formTable[f]
+ n, ok := ft.quickSpan(src, 0, len(b), true)
+ if ok {
+ return b
+ }
+ out := make([]byte, n, len(b))
+ copy(out, b[0:n])
+ rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
+ return doAppendInner(&rb, n)
+}
+
+// String returns f(s).
+func (f Form) String(s string) string {
+ src := inputString(s)
+ ft := formTable[f]
+ n, ok := ft.quickSpan(src, 0, len(s), true)
+ if ok {
+ return s
+ }
+ out := make([]byte, n, len(s))
+ copy(out, s[0:n])
+ rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
+ return string(doAppendInner(&rb, n))
+}
+
+// IsNormal returns true if b == f(b).
+func (f Form) IsNormal(b []byte) bool {
+ src := inputBytes(b)
+ ft := formTable[f]
+ bp, ok := ft.quickSpan(src, 0, len(b), true)
+ if ok {
+ return true
+ }
+ rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
+ rb.setFlusher(nil, cmpNormalBytes)
+ for bp < len(b) {
+ rb.out = b[bp:]
+ if bp = decomposeSegment(&rb, bp, true); bp < 0 {
+ return false
+ }
+ bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
+ }
+ return true
+}
+
+func cmpNormalBytes(rb *reorderBuffer) bool {
+ b := rb.out
+ for i := 0; i < rb.nrune; i++ {
+ info := rb.rune[i]
+ if int(info.size) > len(b) {
+ return false
+ }
+ p := info.pos
+ pe := p + info.size
+ for ; p < pe; p++ {
+ if b[0] != rb.byte[p] {
+ return false
+ }
+ b = b[1:]
+ }
+ }
+ return true
+}
+
+// IsNormalString returns true if s == f(s).
+func (f Form) IsNormalString(s string) bool {
+ src := inputString(s)
+ ft := formTable[f]
+ bp, ok := ft.quickSpan(src, 0, len(s), true)
+ if ok {
+ return true
+ }
+ rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
+ rb.setFlusher(nil, func(rb *reorderBuffer) bool {
+ for i := 0; i < rb.nrune; i++ {
+ info := rb.rune[i]
+ if bp+int(info.size) > len(s) {
+ return false
+ }
+ p := info.pos
+ pe := p + info.size
+ for ; p < pe; p++ {
+ if s[bp] != rb.byte[p] {
+ return false
+ }
+ bp++
+ }
+ }
+ return true
+ })
+ for bp < len(s) {
+ if bp = decomposeSegment(&rb, bp, true); bp < 0 {
+ return false
+ }
+ bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
+ }
+ return true
+}
+
+// patchTail fixes a case where a rune may be incorrectly normalized
+// if it is followed by illegal continuation bytes. It returns the
+// patched buffer and whether the decomposition is still in progress.
+func patchTail(rb *reorderBuffer) bool {
+ info, p := lastRuneStart(&rb.f, rb.out)
+ if p == -1 || info.size == 0 {
+ return true
+ }
+ end := p + int(info.size)
+ extra := len(rb.out) - end
+ if extra > 0 {
+ // Potentially allocating memory. However, this only
+ // happens with ill-formed UTF-8.
+ x := make([]byte, 0)
+ x = append(x, rb.out[len(rb.out)-extra:]...)
+ rb.out = rb.out[:end]
+ decomposeToLastBoundary(rb)
+ rb.doFlush()
+ rb.out = append(rb.out, x...)
+ return false
+ }
+ buf := rb.out[p:]
+ rb.out = rb.out[:p]
+ decomposeToLastBoundary(rb)
+ if s := rb.ss.next(info); s == ssStarter {
+ rb.doFlush()
+ rb.ss.first(info)
+ } else if s == ssOverflow {
+ rb.doFlush()
+ rb.insertCGJ()
+ rb.ss = 0
+ }
+ rb.insertUnsafe(inputBytes(buf), 0, info)
+ return true
+}
+
+func appendQuick(rb *reorderBuffer, i int) int {
+ if rb.nsrc == i {
+ return i
+ }
+ end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
+ rb.out = rb.src.appendSlice(rb.out, i, end)
+ return end
+}
+
+// Append returns f(append(out, b...)).
+// The buffer out must be nil, empty, or equal to f(out).
+func (f Form) Append(out []byte, src ...byte) []byte {
+ return f.doAppend(out, inputBytes(src), len(src))
+}
+
+func (f Form) doAppend(out []byte, src input, n int) []byte {
+ if n == 0 {
+ return out
+ }
+ ft := formTable[f]
+ // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
+ if len(out) == 0 {
+ p, _ := ft.quickSpan(src, 0, n, true)
+ out = src.appendSlice(out, 0, p)
+ if p == n {
+ return out
+ }
+ rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
+ return doAppendInner(&rb, p)
+ }
+ rb := reorderBuffer{f: *ft, src: src, nsrc: n}
+ return doAppend(&rb, out, 0)
+}
+
+func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
+ rb.setFlusher(out, appendFlush)
+ src, n := rb.src, rb.nsrc
+ doMerge := len(out) > 0
+ if q := src.skipContinuationBytes(p); q > p {
+ // Move leading non-starters to destination.
+ rb.out = src.appendSlice(rb.out, p, q)
+ p = q
+ doMerge = patchTail(rb)
+ }
+ fd := &rb.f
+ if doMerge {
+ var info Properties
+ if p < n {
+ info = fd.info(src, p)
+ if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
+ if p == 0 {
+ decomposeToLastBoundary(rb)
+ }
+ p = decomposeSegment(rb, p, true)
+ }
+ }
+ if info.size == 0 {
+ rb.doFlush()
+ // Append incomplete UTF-8 encoding.
+ return src.appendSlice(rb.out, p, n)
+ }
+ if rb.nrune > 0 {
+ return doAppendInner(rb, p)
+ }
+ }
+ p = appendQuick(rb, p)
+ return doAppendInner(rb, p)
+}
+
+func doAppendInner(rb *reorderBuffer, p int) []byte {
+ for n := rb.nsrc; p < n; {
+ p = decomposeSegment(rb, p, true)
+ p = appendQuick(rb, p)
+ }
+ return rb.out
+}
+
+// AppendString returns f(append(out, []byte(s))).
+// The buffer out must be nil, empty, or equal to f(out).
+func (f Form) AppendString(out []byte, src string) []byte {
+ return f.doAppend(out, inputString(src), len(src))
+}
+
+// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
+// It is not guaranteed to return the largest such n.
+func (f Form) QuickSpan(b []byte) int {
+ n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
+ return n
+}
+
+// Span implements transform.SpanningTransformer. It returns a boundary n such
+// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
+func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
+ n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
+ if n < len(b) {
+ if !ok {
+ err = transform.ErrEndOfSpan
+ } else {
+ err = transform.ErrShortSrc
+ }
+ }
+ return n, err
+}
+
+// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
+// It is not guaranteed to return the largest such n.
+func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
+ n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
+ if n < len(s) {
+ if !ok {
+ err = transform.ErrEndOfSpan
+ } else {
+ err = transform.ErrShortSrc
+ }
+ }
+ return n, err
+}
+
+// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
+// whether any non-normalized parts were found. If atEOF is false, n will
+// not point past the last segment if this segment might be become
+// non-normalized by appending other runes.
+func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
+ var lastCC uint8
+ ss := streamSafe(0)
+ lastSegStart := i
+ for n = end; i < n; {
+ if j := src.skipASCII(i, n); i != j {
+ i = j
+ lastSegStart = i - 1
+ lastCC = 0
+ ss = 0
+ continue
+ }
+ info := f.info(src, i)
+ if info.size == 0 {
+ if atEOF {
+ // include incomplete runes
+ return n, true
+ }
+ return lastSegStart, true
+ }
+ // This block needs to be before the next, because it is possible to
+ // have an overflow for runes that are starters (e.g. with U+FF9E).
+ switch ss.next(info) {
+ case ssStarter:
+ lastSegStart = i
+ case ssOverflow:
+ return lastSegStart, false
+ case ssSuccess:
+ if lastCC > info.ccc {
+ return lastSegStart, false
+ }
+ }
+ if f.composing {
+ if !info.isYesC() {
+ break
+ }
+ } else {
+ if !info.isYesD() {
+ break
+ }
+ }
+ lastCC = info.ccc
+ i += int(info.size)
+ }
+ if i == n {
+ if !atEOF {
+ n = lastSegStart
+ }
+ return n, true
+ }
+ return lastSegStart, false
+}
+
+// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
+// It is not guaranteed to return the largest such n.
+func (f Form) QuickSpanString(s string) int {
+ n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
+ return n
+}
+
+// FirstBoundary returns the position i of the first boundary in b
+// or -1 if b contains no boundary.
+func (f Form) FirstBoundary(b []byte) int {
+ return f.firstBoundary(inputBytes(b), len(b))
+}
+
+func (f Form) firstBoundary(src input, nsrc int) int {
+ i := src.skipContinuationBytes(0)
+ if i >= nsrc {
+ return -1
+ }
+ fd := formTable[f]
+ ss := streamSafe(0)
+ // We should call ss.first here, but we can't as the first rune is
+ // skipped already. This means FirstBoundary can't really determine
+ // CGJ insertion points correctly. Luckily it doesn't have to.
+ for {
+ info := fd.info(src, i)
+ if info.size == 0 {
+ return -1
+ }
+ if s := ss.next(info); s != ssSuccess {
+ return i
+ }
+ i += int(info.size)
+ if i >= nsrc {
+ if !info.BoundaryAfter() && !ss.isMax() {
+ return -1
+ }
+ return nsrc
+ }
+ }
+}
+
+// FirstBoundaryInString returns the position i of the first boundary in s
+// or -1 if s contains no boundary.
+func (f Form) FirstBoundaryInString(s string) int {
+ return f.firstBoundary(inputString(s), len(s))
+}
+
+// NextBoundary reports the index of the boundary between the first and next
+// segment in b or -1 if atEOF is false and there are not enough bytes to
+// determine this boundary.
+func (f Form) NextBoundary(b []byte, atEOF bool) int {
+ return f.nextBoundary(inputBytes(b), len(b), atEOF)
+}
+
+// NextBoundaryInString reports the index of the boundary between the first and
+// next segment in b or -1 if atEOF is false and there are not enough bytes to
+// determine this boundary.
+func (f Form) NextBoundaryInString(s string, atEOF bool) int {
+ return f.nextBoundary(inputString(s), len(s), atEOF)
+}
+
+func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
+ if nsrc == 0 {
+ if atEOF {
+ return 0
+ }
+ return -1
+ }
+ fd := formTable[f]
+ info := fd.info(src, 0)
+ if info.size == 0 {
+ if atEOF {
+ return 1
+ }
+ return -1
+ }
+ ss := streamSafe(0)
+ ss.first(info)
+
+ for i := int(info.size); i < nsrc; i += int(info.size) {
+ info = fd.info(src, i)
+ if info.size == 0 {
+ if atEOF {
+ return i
+ }
+ return -1
+ }
+ // TODO: Using streamSafe to determine the boundary isn't the same as
+ // using BoundaryBefore. Determine which should be used.
+ if s := ss.next(info); s != ssSuccess {
+ return i
+ }
+ }
+ if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
+ return -1
+ }
+ return nsrc
+}
+
+// LastBoundary returns the position i of the last boundary in b
+// or -1 if b contains no boundary.
+func (f Form) LastBoundary(b []byte) int {
+ return lastBoundary(formTable[f], b)
+}
+
+func lastBoundary(fd *formInfo, b []byte) int {
+ i := len(b)
+ info, p := lastRuneStart(fd, b)
+ if p == -1 {
+ return -1
+ }
+ if info.size == 0 { // ends with incomplete rune
+ if p == 0 { // starts with incomplete rune
+ return -1
+ }
+ i = p
+ info, p = lastRuneStart(fd, b[:i])
+ if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
+ return i
+ }
+ }
+ if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
+ return i
+ }
+ if info.BoundaryAfter() {
+ return i
+ }
+ ss := streamSafe(0)
+ v := ss.backwards(info)
+ for i = p; i >= 0 && v != ssStarter; i = p {
+ info, p = lastRuneStart(fd, b[:i])
+ if v = ss.backwards(info); v == ssOverflow {
+ break
+ }
+ if p+int(info.size) != i {
+ if p == -1 { // no boundary found
+ return -1
+ }
+ return i // boundary after an illegal UTF-8 encoding
+ }
+ }
+ return i
+}
+
+// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
+// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
+// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
+func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
+ // Force one character to be consumed.
+ info := rb.f.info(rb.src, sp)
+ if info.size == 0 {
+ return 0
+ }
+ if s := rb.ss.next(info); s == ssStarter {
+ // TODO: this could be removed if we don't support merging.
+ if rb.nrune > 0 {
+ goto end
+ }
+ } else if s == ssOverflow {
+ rb.insertCGJ()
+ goto end
+ }
+ if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
+ return int(err)
+ }
+ for {
+ sp += int(info.size)
+ if sp >= rb.nsrc {
+ if !atEOF && !info.BoundaryAfter() {
+ return int(iShortSrc)
+ }
+ break
+ }
+ info = rb.f.info(rb.src, sp)
+ if info.size == 0 {
+ if !atEOF {
+ return int(iShortSrc)
+ }
+ break
+ }
+ if s := rb.ss.next(info); s == ssStarter {
+ break
+ } else if s == ssOverflow {
+ rb.insertCGJ()
+ break
+ }
+ if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
+ return int(err)
+ }
+ }
+end:
+ if !rb.doFlush() {
+ return int(iShortDst)
+ }
+ return sp
+}
+
+// lastRuneStart returns the runeInfo and position of the last
+// rune in buf or the zero runeInfo and -1 if no rune was found.
+func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
+ p := len(buf) - 1
+ for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
+ }
+ if p < 0 {
+ return Properties{}, -1
+ }
+ return fd.info(inputBytes(buf), p), p
+}
+
+// decomposeToLastBoundary finds an open segment at the end of the buffer
+// and scans it into rb. Returns the buffer minus the last segment.
+func decomposeToLastBoundary(rb *reorderBuffer) {
+ fd := &rb.f
+ info, i := lastRuneStart(fd, rb.out)
+ if int(info.size) != len(rb.out)-i {
+ // illegal trailing continuation bytes
+ return
+ }
+ if info.BoundaryAfter() {
+ return
+ }
+ var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
+ padd := 0
+ ss := streamSafe(0)
+ p := len(rb.out)
+ for {
+ add[padd] = info
+ v := ss.backwards(info)
+ if v == ssOverflow {
+ // Note that if we have an overflow, it the string we are appending to
+ // is not correctly normalized. In this case the behavior is undefined.
+ break
+ }
+ padd++
+ p -= int(info.size)
+ if v == ssStarter || p < 0 {
+ break
+ }
+ info, i = lastRuneStart(fd, rb.out[:p])
+ if int(info.size) != p-i {
+ break
+ }
+ }
+ rb.ss = ss
+ // Copy bytes for insertion as we may need to overwrite rb.out.
+ var buf [maxBufferSize * utf8.UTFMax]byte
+ cp := buf[:copy(buf[:], rb.out[p:])]
+ rb.out = rb.out[:p]
+ for padd--; padd >= 0; padd-- {
+ info = add[padd]
+ rb.insertUnsafe(inputBytes(cp), 0, info)
+ cp = cp[info.size:]
+ }
+}