diff options
author | Gibheer <gibheer+git@zero-knowledge.org> | 2024-09-05 19:38:25 +0200 |
---|---|---|
committer | Gibheer <gibheer+git@zero-knowledge.org> | 2024-09-05 19:38:25 +0200 |
commit | 6ea4d2c82de80efc87708e5e182034b7c6c2019e (patch) | |
tree | 35c0856a929040216c82153ca62d43b27530a887 /vendor/golang.org/x/text/unicode/norm/iter.go | |
parent | 6f64eeace1b66639b9380b44e88a8d54850a4306 (diff) |
lib/pq is out of maintenance for some time now, so switch to the newer
more active library. Looks like it finally stabilized after a long time.
Diffstat (limited to 'vendor/golang.org/x/text/unicode/norm/iter.go')
-rw-r--r-- | vendor/golang.org/x/text/unicode/norm/iter.go | 458 |
1 files changed, 458 insertions, 0 deletions
diff --git a/vendor/golang.org/x/text/unicode/norm/iter.go b/vendor/golang.org/x/text/unicode/norm/iter.go new file mode 100644 index 0000000..417c6b2 --- /dev/null +++ b/vendor/golang.org/x/text/unicode/norm/iter.go @@ -0,0 +1,458 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package norm + +import ( + "fmt" + "unicode/utf8" +) + +// MaxSegmentSize is the maximum size of a byte buffer needed to consider any +// sequence of starter and non-starter runes for the purpose of normalization. +const MaxSegmentSize = maxByteBufferSize + +// An Iter iterates over a string or byte slice, while normalizing it +// to a given Form. +type Iter struct { + rb reorderBuffer + buf [maxByteBufferSize]byte + info Properties // first character saved from previous iteration + next iterFunc // implementation of next depends on form + asciiF iterFunc + + p int // current position in input source + multiSeg []byte // remainder of multi-segment decomposition +} + +type iterFunc func(*Iter) []byte + +// Init initializes i to iterate over src after normalizing it to Form f. +func (i *Iter) Init(f Form, src []byte) { + i.p = 0 + if len(src) == 0 { + i.setDone() + i.rb.nsrc = 0 + return + } + i.multiSeg = nil + i.rb.init(f, src) + i.next = i.rb.f.nextMain + i.asciiF = nextASCIIBytes + i.info = i.rb.f.info(i.rb.src, i.p) + i.rb.ss.first(i.info) +} + +// InitString initializes i to iterate over src after normalizing it to Form f. +func (i *Iter) InitString(f Form, src string) { + i.p = 0 + if len(src) == 0 { + i.setDone() + i.rb.nsrc = 0 + return + } + i.multiSeg = nil + i.rb.initString(f, src) + i.next = i.rb.f.nextMain + i.asciiF = nextASCIIString + i.info = i.rb.f.info(i.rb.src, i.p) + i.rb.ss.first(i.info) +} + +// Seek sets the segment to be returned by the next call to Next to start +// at position p. It is the responsibility of the caller to set p to the +// start of a segment. +func (i *Iter) Seek(offset int64, whence int) (int64, error) { + var abs int64 + switch whence { + case 0: + abs = offset + case 1: + abs = int64(i.p) + offset + case 2: + abs = int64(i.rb.nsrc) + offset + default: + return 0, fmt.Errorf("norm: invalid whence") + } + if abs < 0 { + return 0, fmt.Errorf("norm: negative position") + } + if int(abs) >= i.rb.nsrc { + i.setDone() + return int64(i.p), nil + } + i.p = int(abs) + i.multiSeg = nil + i.next = i.rb.f.nextMain + i.info = i.rb.f.info(i.rb.src, i.p) + i.rb.ss.first(i.info) + return abs, nil +} + +// returnSlice returns a slice of the underlying input type as a byte slice. +// If the underlying is of type []byte, it will simply return a slice. +// If the underlying is of type string, it will copy the slice to the buffer +// and return that. +func (i *Iter) returnSlice(a, b int) []byte { + if i.rb.src.bytes == nil { + return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])] + } + return i.rb.src.bytes[a:b] +} + +// Pos returns the byte position at which the next call to Next will commence processing. +func (i *Iter) Pos() int { + return i.p +} + +func (i *Iter) setDone() { + i.next = nextDone + i.p = i.rb.nsrc +} + +// Done returns true if there is no more input to process. +func (i *Iter) Done() bool { + return i.p >= i.rb.nsrc +} + +// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input. +// For any input a and b for which f(a) == f(b), subsequent calls +// to Next will return the same segments. +// Modifying runes are grouped together with the preceding starter, if such a starter exists. +// Although not guaranteed, n will typically be the smallest possible n. +func (i *Iter) Next() []byte { + return i.next(i) +} + +func nextASCIIBytes(i *Iter) []byte { + p := i.p + 1 + if p >= i.rb.nsrc { + p0 := i.p + i.setDone() + return i.rb.src.bytes[p0:p] + } + if i.rb.src.bytes[p] < utf8.RuneSelf { + p0 := i.p + i.p = p + return i.rb.src.bytes[p0:p] + } + i.info = i.rb.f.info(i.rb.src, i.p) + i.next = i.rb.f.nextMain + return i.next(i) +} + +func nextASCIIString(i *Iter) []byte { + p := i.p + 1 + if p >= i.rb.nsrc { + i.buf[0] = i.rb.src.str[i.p] + i.setDone() + return i.buf[:1] + } + if i.rb.src.str[p] < utf8.RuneSelf { + i.buf[0] = i.rb.src.str[i.p] + i.p = p + return i.buf[:1] + } + i.info = i.rb.f.info(i.rb.src, i.p) + i.next = i.rb.f.nextMain + return i.next(i) +} + +func nextHangul(i *Iter) []byte { + p := i.p + next := p + hangulUTF8Size + if next >= i.rb.nsrc { + i.setDone() + } else if i.rb.src.hangul(next) == 0 { + i.rb.ss.next(i.info) + i.info = i.rb.f.info(i.rb.src, i.p) + i.next = i.rb.f.nextMain + return i.next(i) + } + i.p = next + return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))] +} + +func nextDone(i *Iter) []byte { + return nil +} + +// nextMulti is used for iterating over multi-segment decompositions +// for decomposing normal forms. +func nextMulti(i *Iter) []byte { + j := 0 + d := i.multiSeg + // skip first rune + for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ { + } + for j < len(d) { + info := i.rb.f.info(input{bytes: d}, j) + if info.BoundaryBefore() { + i.multiSeg = d[j:] + return d[:j] + } + j += int(info.size) + } + // treat last segment as normal decomposition + i.next = i.rb.f.nextMain + return i.next(i) +} + +// nextMultiNorm is used for iterating over multi-segment decompositions +// for composing normal forms. +func nextMultiNorm(i *Iter) []byte { + j := 0 + d := i.multiSeg + for j < len(d) { + info := i.rb.f.info(input{bytes: d}, j) + if info.BoundaryBefore() { + i.rb.compose() + seg := i.buf[:i.rb.flushCopy(i.buf[:])] + i.rb.insertUnsafe(input{bytes: d}, j, info) + i.multiSeg = d[j+int(info.size):] + return seg + } + i.rb.insertUnsafe(input{bytes: d}, j, info) + j += int(info.size) + } + i.multiSeg = nil + i.next = nextComposed + return doNormComposed(i) +} + +// nextDecomposed is the implementation of Next for forms NFD and NFKD. +func nextDecomposed(i *Iter) (next []byte) { + outp := 0 + inCopyStart, outCopyStart := i.p, 0 + for { + if sz := int(i.info.size); sz <= 1 { + i.rb.ss = 0 + p := i.p + i.p++ // ASCII or illegal byte. Either way, advance by 1. + if i.p >= i.rb.nsrc { + i.setDone() + return i.returnSlice(p, i.p) + } else if i.rb.src._byte(i.p) < utf8.RuneSelf { + i.next = i.asciiF + return i.returnSlice(p, i.p) + } + outp++ + } else if d := i.info.Decomposition(); d != nil { + // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero. + // Case 1: there is a leftover to copy. In this case the decomposition + // must begin with a modifier and should always be appended. + // Case 2: no leftover. Simply return d if followed by a ccc == 0 value. + p := outp + len(d) + if outp > 0 { + i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) + // TODO: this condition should not be possible, but we leave it + // in for defensive purposes. + if p > len(i.buf) { + return i.buf[:outp] + } + } else if i.info.multiSegment() { + // outp must be 0 as multi-segment decompositions always + // start a new segment. + if i.multiSeg == nil { + i.multiSeg = d + i.next = nextMulti + return nextMulti(i) + } + // We are in the last segment. Treat as normal decomposition. + d = i.multiSeg + i.multiSeg = nil + p = len(d) + } + prevCC := i.info.tccc + if i.p += sz; i.p >= i.rb.nsrc { + i.setDone() + i.info = Properties{} // Force BoundaryBefore to succeed. + } else { + i.info = i.rb.f.info(i.rb.src, i.p) + } + switch i.rb.ss.next(i.info) { + case ssOverflow: + i.next = nextCGJDecompose + fallthrough + case ssStarter: + if outp > 0 { + copy(i.buf[outp:], d) + return i.buf[:p] + } + return d + } + copy(i.buf[outp:], d) + outp = p + inCopyStart, outCopyStart = i.p, outp + if i.info.ccc < prevCC { + goto doNorm + } + continue + } else if r := i.rb.src.hangul(i.p); r != 0 { + outp = decomposeHangul(i.buf[:], r) + i.p += hangulUTF8Size + inCopyStart, outCopyStart = i.p, outp + if i.p >= i.rb.nsrc { + i.setDone() + break + } else if i.rb.src.hangul(i.p) != 0 { + i.next = nextHangul + return i.buf[:outp] + } + } else { + p := outp + sz + if p > len(i.buf) { + break + } + outp = p + i.p += sz + } + if i.p >= i.rb.nsrc { + i.setDone() + break + } + prevCC := i.info.tccc + i.info = i.rb.f.info(i.rb.src, i.p) + if v := i.rb.ss.next(i.info); v == ssStarter { + break + } else if v == ssOverflow { + i.next = nextCGJDecompose + break + } + if i.info.ccc < prevCC { + goto doNorm + } + } + if outCopyStart == 0 { + return i.returnSlice(inCopyStart, i.p) + } else if inCopyStart < i.p { + i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) + } + return i.buf[:outp] +doNorm: + // Insert what we have decomposed so far in the reorderBuffer. + // As we will only reorder, there will always be enough room. + i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p) + i.rb.insertDecomposed(i.buf[0:outp]) + return doNormDecomposed(i) +} + +func doNormDecomposed(i *Iter) []byte { + for { + i.rb.insertUnsafe(i.rb.src, i.p, i.info) + if i.p += int(i.info.size); i.p >= i.rb.nsrc { + i.setDone() + break + } + i.info = i.rb.f.info(i.rb.src, i.p) + if i.info.ccc == 0 { + break + } + if s := i.rb.ss.next(i.info); s == ssOverflow { + i.next = nextCGJDecompose + break + } + } + // new segment or too many combining characters: exit normalization + return i.buf[:i.rb.flushCopy(i.buf[:])] +} + +func nextCGJDecompose(i *Iter) []byte { + i.rb.ss = 0 + i.rb.insertCGJ() + i.next = nextDecomposed + i.rb.ss.first(i.info) + buf := doNormDecomposed(i) + return buf +} + +// nextComposed is the implementation of Next for forms NFC and NFKC. +func nextComposed(i *Iter) []byte { + outp, startp := 0, i.p + var prevCC uint8 + for { + if !i.info.isYesC() { + goto doNorm + } + prevCC = i.info.tccc + sz := int(i.info.size) + if sz == 0 { + sz = 1 // illegal rune: copy byte-by-byte + } + p := outp + sz + if p > len(i.buf) { + break + } + outp = p + i.p += sz + if i.p >= i.rb.nsrc { + i.setDone() + break + } else if i.rb.src._byte(i.p) < utf8.RuneSelf { + i.rb.ss = 0 + i.next = i.asciiF + break + } + i.info = i.rb.f.info(i.rb.src, i.p) + if v := i.rb.ss.next(i.info); v == ssStarter { + break + } else if v == ssOverflow { + i.next = nextCGJCompose + break + } + if i.info.ccc < prevCC { + goto doNorm + } + } + return i.returnSlice(startp, i.p) +doNorm: + // reset to start position + i.p = startp + i.info = i.rb.f.info(i.rb.src, i.p) + i.rb.ss.first(i.info) + if i.info.multiSegment() { + d := i.info.Decomposition() + info := i.rb.f.info(input{bytes: d}, 0) + i.rb.insertUnsafe(input{bytes: d}, 0, info) + i.multiSeg = d[int(info.size):] + i.next = nextMultiNorm + return nextMultiNorm(i) + } + i.rb.ss.first(i.info) + i.rb.insertUnsafe(i.rb.src, i.p, i.info) + return doNormComposed(i) +} + +func doNormComposed(i *Iter) []byte { + // First rune should already be inserted. + for { + if i.p += int(i.info.size); i.p >= i.rb.nsrc { + i.setDone() + break + } + i.info = i.rb.f.info(i.rb.src, i.p) + if s := i.rb.ss.next(i.info); s == ssStarter { + break + } else if s == ssOverflow { + i.next = nextCGJCompose + break + } + i.rb.insertUnsafe(i.rb.src, i.p, i.info) + } + i.rb.compose() + seg := i.buf[:i.rb.flushCopy(i.buf[:])] + return seg +} + +func nextCGJCompose(i *Iter) []byte { + i.rb.ss = 0 // instead of first + i.rb.insertCGJ() + i.next = nextComposed + // Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter, + // even if they are not. This is particularly dubious for U+FF9E and UFF9A. + // If we ever change that, insert a check here. + i.rb.ss.first(i.info) + i.rb.insertUnsafe(i.rb.src, i.p, i.info) + return doNormComposed(i) +} |