Gibheer
6ea4d2c82d
lib/pq is out of maintenance for some time now, so switch to the newer more active library. Looks like it finally stabilized after a long time.
1072 lines
29 KiB
Go
1072 lines
29 KiB
Go
// Copyright 2015 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package bidi
|
|
|
|
import (
|
|
"fmt"
|
|
"log"
|
|
)
|
|
|
|
// This implementation is a port based on the reference implementation found at:
|
|
// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/
|
|
//
|
|
// described in Unicode Bidirectional Algorithm (UAX #9).
|
|
//
|
|
// Input:
|
|
// There are two levels of input to the algorithm, since clients may prefer to
|
|
// supply some information from out-of-band sources rather than relying on the
|
|
// default behavior.
|
|
//
|
|
// - Bidi class array
|
|
// - Bidi class array, with externally supplied base line direction
|
|
//
|
|
// Output:
|
|
// Output is separated into several stages:
|
|
//
|
|
// - levels array over entire paragraph
|
|
// - reordering array over entire paragraph
|
|
// - levels array over line
|
|
// - reordering array over line
|
|
//
|
|
// Note that for conformance to the Unicode Bidirectional Algorithm,
|
|
// implementations are only required to generate correct reordering and
|
|
// character directionality (odd or even levels) over a line. Generating
|
|
// identical level arrays over a line is not required. Bidi explicit format
|
|
// codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and
|
|
// positions as long as the rest of the input is properly reordered.
|
|
//
|
|
// As the algorithm is defined to operate on a single paragraph at a time, this
|
|
// implementation is written to handle single paragraphs. Thus rule P1 is
|
|
// presumed by this implementation-- the data provided to the implementation is
|
|
// assumed to be a single paragraph, and either contains no 'B' codes, or a
|
|
// single 'B' code at the end of the input. 'B' is allowed as input to
|
|
// illustrate how the algorithm assigns it a level.
|
|
//
|
|
// Also note that rules L3 and L4 depend on the rendering engine that uses the
|
|
// result of the bidi algorithm. This implementation assumes that the rendering
|
|
// engine expects combining marks in visual order (e.g. to the left of their
|
|
// base character in RTL runs) and that it adjusts the glyphs used to render
|
|
// mirrored characters that are in RTL runs so that they render appropriately.
|
|
|
|
// level is the embedding level of a character. Even embedding levels indicate
|
|
// left-to-right order and odd levels indicate right-to-left order. The special
|
|
// level of -1 is reserved for undefined order.
|
|
type level int8
|
|
|
|
const implicitLevel level = -1
|
|
|
|
// in returns if x is equal to any of the values in set.
|
|
func (c Class) in(set ...Class) bool {
|
|
for _, s := range set {
|
|
if c == s {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// A paragraph contains the state of a paragraph.
|
|
type paragraph struct {
|
|
initialTypes []Class
|
|
|
|
// Arrays of properties needed for paired bracket evaluation in N0
|
|
pairTypes []bracketType // paired Bracket types for paragraph
|
|
pairValues []rune // rune for opening bracket or pbOpen and pbClose; 0 for pbNone
|
|
|
|
embeddingLevel level // default: = implicitLevel;
|
|
|
|
// at the paragraph levels
|
|
resultTypes []Class
|
|
resultLevels []level
|
|
|
|
// Index of matching PDI for isolate initiator characters. For other
|
|
// characters, the value of matchingPDI will be set to -1. For isolate
|
|
// initiators with no matching PDI, matchingPDI will be set to the length of
|
|
// the input string.
|
|
matchingPDI []int
|
|
|
|
// Index of matching isolate initiator for PDI characters. For other
|
|
// characters, and for PDIs with no matching isolate initiator, the value of
|
|
// matchingIsolateInitiator will be set to -1.
|
|
matchingIsolateInitiator []int
|
|
}
|
|
|
|
// newParagraph initializes a paragraph. The user needs to supply a few arrays
|
|
// corresponding to the preprocessed text input. The types correspond to the
|
|
// Unicode BiDi classes for each rune. pairTypes indicates the bracket type for
|
|
// each rune. pairValues provides a unique bracket class identifier for each
|
|
// rune (suggested is the rune of the open bracket for opening and matching
|
|
// close brackets, after normalization). The embedding levels are optional, but
|
|
// may be supplied to encode embedding levels of styled text.
|
|
func newParagraph(types []Class, pairTypes []bracketType, pairValues []rune, levels level) (*paragraph, error) {
|
|
var err error
|
|
if err = validateTypes(types); err != nil {
|
|
return nil, err
|
|
}
|
|
if err = validatePbTypes(pairTypes); err != nil {
|
|
return nil, err
|
|
}
|
|
if err = validatePbValues(pairValues, pairTypes); err != nil {
|
|
return nil, err
|
|
}
|
|
if err = validateParagraphEmbeddingLevel(levels); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
p := ¶graph{
|
|
initialTypes: append([]Class(nil), types...),
|
|
embeddingLevel: levels,
|
|
|
|
pairTypes: pairTypes,
|
|
pairValues: pairValues,
|
|
|
|
resultTypes: append([]Class(nil), types...),
|
|
}
|
|
p.run()
|
|
return p, nil
|
|
}
|
|
|
|
func (p *paragraph) Len() int { return len(p.initialTypes) }
|
|
|
|
// The algorithm. Does not include line-based processing (Rules L1, L2).
|
|
// These are applied later in the line-based phase of the algorithm.
|
|
func (p *paragraph) run() {
|
|
p.determineMatchingIsolates()
|
|
|
|
// 1) determining the paragraph level
|
|
// Rule P1 is the requirement for entering this algorithm.
|
|
// Rules P2, P3.
|
|
// If no externally supplied paragraph embedding level, use default.
|
|
if p.embeddingLevel == implicitLevel {
|
|
p.embeddingLevel = p.determineParagraphEmbeddingLevel(0, p.Len())
|
|
}
|
|
|
|
// Initialize result levels to paragraph embedding level.
|
|
p.resultLevels = make([]level, p.Len())
|
|
setLevels(p.resultLevels, p.embeddingLevel)
|
|
|
|
// 2) Explicit levels and directions
|
|
// Rules X1-X8.
|
|
p.determineExplicitEmbeddingLevels()
|
|
|
|
// Rule X9.
|
|
// We do not remove the embeddings, the overrides, the PDFs, and the BNs
|
|
// from the string explicitly. But they are not copied into isolating run
|
|
// sequences when they are created, so they are removed for all
|
|
// practical purposes.
|
|
|
|
// Rule X10.
|
|
// Run remainder of algorithm one isolating run sequence at a time
|
|
for _, seq := range p.determineIsolatingRunSequences() {
|
|
// 3) resolving weak types
|
|
// Rules W1-W7.
|
|
seq.resolveWeakTypes()
|
|
|
|
// 4a) resolving paired brackets
|
|
// Rule N0
|
|
resolvePairedBrackets(seq)
|
|
|
|
// 4b) resolving neutral types
|
|
// Rules N1-N3.
|
|
seq.resolveNeutralTypes()
|
|
|
|
// 5) resolving implicit embedding levels
|
|
// Rules I1, I2.
|
|
seq.resolveImplicitLevels()
|
|
|
|
// Apply the computed levels and types
|
|
seq.applyLevelsAndTypes()
|
|
}
|
|
|
|
// Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and
|
|
// BNs. This is for convenience, so the resulting level array will have
|
|
// a value for every character.
|
|
p.assignLevelsToCharactersRemovedByX9()
|
|
}
|
|
|
|
// determineMatchingIsolates determines the matching PDI for each isolate
|
|
// initiator and vice versa.
|
|
//
|
|
// Definition BD9.
|
|
//
|
|
// At the end of this function:
|
|
//
|
|
// - The member variable matchingPDI is set to point to the index of the
|
|
// matching PDI character for each isolate initiator character. If there is
|
|
// no matching PDI, it is set to the length of the input text. For other
|
|
// characters, it is set to -1.
|
|
// - The member variable matchingIsolateInitiator is set to point to the
|
|
// index of the matching isolate initiator character for each PDI character.
|
|
// If there is no matching isolate initiator, or the character is not a PDI,
|
|
// it is set to -1.
|
|
func (p *paragraph) determineMatchingIsolates() {
|
|
p.matchingPDI = make([]int, p.Len())
|
|
p.matchingIsolateInitiator = make([]int, p.Len())
|
|
|
|
for i := range p.matchingIsolateInitiator {
|
|
p.matchingIsolateInitiator[i] = -1
|
|
}
|
|
|
|
for i := range p.matchingPDI {
|
|
p.matchingPDI[i] = -1
|
|
|
|
if t := p.resultTypes[i]; t.in(LRI, RLI, FSI) {
|
|
depthCounter := 1
|
|
for j := i + 1; j < p.Len(); j++ {
|
|
if u := p.resultTypes[j]; u.in(LRI, RLI, FSI) {
|
|
depthCounter++
|
|
} else if u == PDI {
|
|
if depthCounter--; depthCounter == 0 {
|
|
p.matchingPDI[i] = j
|
|
p.matchingIsolateInitiator[j] = i
|
|
break
|
|
}
|
|
}
|
|
}
|
|
if p.matchingPDI[i] == -1 {
|
|
p.matchingPDI[i] = p.Len()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// determineParagraphEmbeddingLevel reports the resolved paragraph direction of
|
|
// the substring limited by the given range [start, end).
|
|
//
|
|
// Determines the paragraph level based on rules P2, P3. This is also used
|
|
// in rule X5c to find if an FSI should resolve to LRI or RLI.
|
|
func (p *paragraph) determineParagraphEmbeddingLevel(start, end int) level {
|
|
var strongType Class = unknownClass
|
|
|
|
// Rule P2.
|
|
for i := start; i < end; i++ {
|
|
if t := p.resultTypes[i]; t.in(L, AL, R) {
|
|
strongType = t
|
|
break
|
|
} else if t.in(FSI, LRI, RLI) {
|
|
i = p.matchingPDI[i] // skip over to the matching PDI
|
|
if i > end {
|
|
log.Panic("assert (i <= end)")
|
|
}
|
|
}
|
|
}
|
|
// Rule P3.
|
|
switch strongType {
|
|
case unknownClass: // none found
|
|
// default embedding level when no strong types found is 0.
|
|
return 0
|
|
case L:
|
|
return 0
|
|
default: // AL, R
|
|
return 1
|
|
}
|
|
}
|
|
|
|
const maxDepth = 125
|
|
|
|
// This stack will store the embedding levels and override and isolated
|
|
// statuses
|
|
type directionalStatusStack struct {
|
|
stackCounter int
|
|
embeddingLevelStack [maxDepth + 1]level
|
|
overrideStatusStack [maxDepth + 1]Class
|
|
isolateStatusStack [maxDepth + 1]bool
|
|
}
|
|
|
|
func (s *directionalStatusStack) empty() { s.stackCounter = 0 }
|
|
func (s *directionalStatusStack) pop() { s.stackCounter-- }
|
|
func (s *directionalStatusStack) depth() int { return s.stackCounter }
|
|
|
|
func (s *directionalStatusStack) push(level level, overrideStatus Class, isolateStatus bool) {
|
|
s.embeddingLevelStack[s.stackCounter] = level
|
|
s.overrideStatusStack[s.stackCounter] = overrideStatus
|
|
s.isolateStatusStack[s.stackCounter] = isolateStatus
|
|
s.stackCounter++
|
|
}
|
|
|
|
func (s *directionalStatusStack) lastEmbeddingLevel() level {
|
|
return s.embeddingLevelStack[s.stackCounter-1]
|
|
}
|
|
|
|
func (s *directionalStatusStack) lastDirectionalOverrideStatus() Class {
|
|
return s.overrideStatusStack[s.stackCounter-1]
|
|
}
|
|
|
|
func (s *directionalStatusStack) lastDirectionalIsolateStatus() bool {
|
|
return s.isolateStatusStack[s.stackCounter-1]
|
|
}
|
|
|
|
// Determine explicit levels using rules X1 - X8
|
|
func (p *paragraph) determineExplicitEmbeddingLevels() {
|
|
var stack directionalStatusStack
|
|
var overflowIsolateCount, overflowEmbeddingCount, validIsolateCount int
|
|
|
|
// Rule X1.
|
|
stack.push(p.embeddingLevel, ON, false)
|
|
|
|
for i, t := range p.resultTypes {
|
|
// Rules X2, X3, X4, X5, X5a, X5b, X5c
|
|
switch t {
|
|
case RLE, LRE, RLO, LRO, RLI, LRI, FSI:
|
|
isIsolate := t.in(RLI, LRI, FSI)
|
|
isRTL := t.in(RLE, RLO, RLI)
|
|
|
|
// override if this is an FSI that resolves to RLI
|
|
if t == FSI {
|
|
isRTL = (p.determineParagraphEmbeddingLevel(i+1, p.matchingPDI[i]) == 1)
|
|
}
|
|
if isIsolate {
|
|
p.resultLevels[i] = stack.lastEmbeddingLevel()
|
|
if stack.lastDirectionalOverrideStatus() != ON {
|
|
p.resultTypes[i] = stack.lastDirectionalOverrideStatus()
|
|
}
|
|
}
|
|
|
|
var newLevel level
|
|
if isRTL {
|
|
// least greater odd
|
|
newLevel = (stack.lastEmbeddingLevel() + 1) | 1
|
|
} else {
|
|
// least greater even
|
|
newLevel = (stack.lastEmbeddingLevel() + 2) &^ 1
|
|
}
|
|
|
|
if newLevel <= maxDepth && overflowIsolateCount == 0 && overflowEmbeddingCount == 0 {
|
|
if isIsolate {
|
|
validIsolateCount++
|
|
}
|
|
// Push new embedding level, override status, and isolated
|
|
// status.
|
|
// No check for valid stack counter, since the level check
|
|
// suffices.
|
|
switch t {
|
|
case LRO:
|
|
stack.push(newLevel, L, isIsolate)
|
|
case RLO:
|
|
stack.push(newLevel, R, isIsolate)
|
|
default:
|
|
stack.push(newLevel, ON, isIsolate)
|
|
}
|
|
// Not really part of the spec
|
|
if !isIsolate {
|
|
p.resultLevels[i] = newLevel
|
|
}
|
|
} else {
|
|
// This is an invalid explicit formatting character,
|
|
// so apply the "Otherwise" part of rules X2-X5b.
|
|
if isIsolate {
|
|
overflowIsolateCount++
|
|
} else { // !isIsolate
|
|
if overflowIsolateCount == 0 {
|
|
overflowEmbeddingCount++
|
|
}
|
|
}
|
|
}
|
|
|
|
// Rule X6a
|
|
case PDI:
|
|
if overflowIsolateCount > 0 {
|
|
overflowIsolateCount--
|
|
} else if validIsolateCount == 0 {
|
|
// do nothing
|
|
} else {
|
|
overflowEmbeddingCount = 0
|
|
for !stack.lastDirectionalIsolateStatus() {
|
|
stack.pop()
|
|
}
|
|
stack.pop()
|
|
validIsolateCount--
|
|
}
|
|
p.resultLevels[i] = stack.lastEmbeddingLevel()
|
|
|
|
// Rule X7
|
|
case PDF:
|
|
// Not really part of the spec
|
|
p.resultLevels[i] = stack.lastEmbeddingLevel()
|
|
|
|
if overflowIsolateCount > 0 {
|
|
// do nothing
|
|
} else if overflowEmbeddingCount > 0 {
|
|
overflowEmbeddingCount--
|
|
} else if !stack.lastDirectionalIsolateStatus() && stack.depth() >= 2 {
|
|
stack.pop()
|
|
}
|
|
|
|
case B: // paragraph separator.
|
|
// Rule X8.
|
|
|
|
// These values are reset for clarity, in this implementation B
|
|
// can only occur as the last code in the array.
|
|
stack.empty()
|
|
overflowIsolateCount = 0
|
|
overflowEmbeddingCount = 0
|
|
validIsolateCount = 0
|
|
p.resultLevels[i] = p.embeddingLevel
|
|
|
|
default:
|
|
p.resultLevels[i] = stack.lastEmbeddingLevel()
|
|
if stack.lastDirectionalOverrideStatus() != ON {
|
|
p.resultTypes[i] = stack.lastDirectionalOverrideStatus()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
type isolatingRunSequence struct {
|
|
p *paragraph
|
|
|
|
indexes []int // indexes to the original string
|
|
|
|
types []Class // type of each character using the index
|
|
resolvedLevels []level // resolved levels after application of rules
|
|
level level
|
|
sos, eos Class
|
|
}
|
|
|
|
func (i *isolatingRunSequence) Len() int { return len(i.indexes) }
|
|
|
|
func maxLevel(a, b level) level {
|
|
if a > b {
|
|
return a
|
|
}
|
|
return b
|
|
}
|
|
|
|
// Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types,
|
|
// either L or R, for each isolating run sequence.
|
|
func (p *paragraph) isolatingRunSequence(indexes []int) *isolatingRunSequence {
|
|
length := len(indexes)
|
|
types := make([]Class, length)
|
|
for i, x := range indexes {
|
|
types[i] = p.resultTypes[x]
|
|
}
|
|
|
|
// assign level, sos and eos
|
|
prevChar := indexes[0] - 1
|
|
for prevChar >= 0 && isRemovedByX9(p.initialTypes[prevChar]) {
|
|
prevChar--
|
|
}
|
|
prevLevel := p.embeddingLevel
|
|
if prevChar >= 0 {
|
|
prevLevel = p.resultLevels[prevChar]
|
|
}
|
|
|
|
var succLevel level
|
|
lastType := types[length-1]
|
|
if lastType.in(LRI, RLI, FSI) {
|
|
succLevel = p.embeddingLevel
|
|
} else {
|
|
// the first character after the end of run sequence
|
|
limit := indexes[length-1] + 1
|
|
for ; limit < p.Len() && isRemovedByX9(p.initialTypes[limit]); limit++ {
|
|
|
|
}
|
|
succLevel = p.embeddingLevel
|
|
if limit < p.Len() {
|
|
succLevel = p.resultLevels[limit]
|
|
}
|
|
}
|
|
level := p.resultLevels[indexes[0]]
|
|
return &isolatingRunSequence{
|
|
p: p,
|
|
indexes: indexes,
|
|
types: types,
|
|
level: level,
|
|
sos: typeForLevel(maxLevel(prevLevel, level)),
|
|
eos: typeForLevel(maxLevel(succLevel, level)),
|
|
}
|
|
}
|
|
|
|
// Resolving weak types Rules W1-W7.
|
|
//
|
|
// Note that some weak types (EN, AN) remain after this processing is
|
|
// complete.
|
|
func (s *isolatingRunSequence) resolveWeakTypes() {
|
|
|
|
// on entry, only these types remain
|
|
s.assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI)
|
|
|
|
// Rule W1.
|
|
// Changes all NSMs.
|
|
precedingCharacterType := s.sos
|
|
for i, t := range s.types {
|
|
if t == NSM {
|
|
s.types[i] = precedingCharacterType
|
|
} else {
|
|
// if t.in(LRI, RLI, FSI, PDI) {
|
|
// precedingCharacterType = ON
|
|
// }
|
|
precedingCharacterType = t
|
|
}
|
|
}
|
|
|
|
// Rule W2.
|
|
// EN does not change at the start of the run, because sos != AL.
|
|
for i, t := range s.types {
|
|
if t == EN {
|
|
for j := i - 1; j >= 0; j-- {
|
|
if t := s.types[j]; t.in(L, R, AL) {
|
|
if t == AL {
|
|
s.types[i] = AN
|
|
}
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Rule W3.
|
|
for i, t := range s.types {
|
|
if t == AL {
|
|
s.types[i] = R
|
|
}
|
|
}
|
|
|
|
// Rule W4.
|
|
// Since there must be values on both sides for this rule to have an
|
|
// effect, the scan skips the first and last value.
|
|
//
|
|
// Although the scan proceeds left to right, and changes the type
|
|
// values in a way that would appear to affect the computations
|
|
// later in the scan, there is actually no problem. A change in the
|
|
// current value can only affect the value to its immediate right,
|
|
// and only affect it if it is ES or CS. But the current value can
|
|
// only change if the value to its right is not ES or CS. Thus
|
|
// either the current value will not change, or its change will have
|
|
// no effect on the remainder of the analysis.
|
|
|
|
for i := 1; i < s.Len()-1; i++ {
|
|
t := s.types[i]
|
|
if t == ES || t == CS {
|
|
prevSepType := s.types[i-1]
|
|
succSepType := s.types[i+1]
|
|
if prevSepType == EN && succSepType == EN {
|
|
s.types[i] = EN
|
|
} else if s.types[i] == CS && prevSepType == AN && succSepType == AN {
|
|
s.types[i] = AN
|
|
}
|
|
}
|
|
}
|
|
|
|
// Rule W5.
|
|
for i, t := range s.types {
|
|
if t == ET {
|
|
// locate end of sequence
|
|
runStart := i
|
|
runEnd := s.findRunLimit(runStart, ET)
|
|
|
|
// check values at ends of sequence
|
|
t := s.sos
|
|
if runStart > 0 {
|
|
t = s.types[runStart-1]
|
|
}
|
|
if t != EN {
|
|
t = s.eos
|
|
if runEnd < len(s.types) {
|
|
t = s.types[runEnd]
|
|
}
|
|
}
|
|
if t == EN {
|
|
setTypes(s.types[runStart:runEnd], EN)
|
|
}
|
|
// continue at end of sequence
|
|
i = runEnd
|
|
}
|
|
}
|
|
|
|
// Rule W6.
|
|
for i, t := range s.types {
|
|
if t.in(ES, ET, CS) {
|
|
s.types[i] = ON
|
|
}
|
|
}
|
|
|
|
// Rule W7.
|
|
for i, t := range s.types {
|
|
if t == EN {
|
|
// set default if we reach start of run
|
|
prevStrongType := s.sos
|
|
for j := i - 1; j >= 0; j-- {
|
|
t = s.types[j]
|
|
if t == L || t == R { // AL's have been changed to R
|
|
prevStrongType = t
|
|
break
|
|
}
|
|
}
|
|
if prevStrongType == L {
|
|
s.types[i] = L
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// 6) resolving neutral types Rules N1-N2.
|
|
func (s *isolatingRunSequence) resolveNeutralTypes() {
|
|
|
|
// on entry, only these types can be in resultTypes
|
|
s.assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI)
|
|
|
|
for i, t := range s.types {
|
|
switch t {
|
|
case WS, ON, B, S, RLI, LRI, FSI, PDI:
|
|
// find bounds of run of neutrals
|
|
runStart := i
|
|
runEnd := s.findRunLimit(runStart, B, S, WS, ON, RLI, LRI, FSI, PDI)
|
|
|
|
// determine effective types at ends of run
|
|
var leadType, trailType Class
|
|
|
|
// Note that the character found can only be L, R, AN, or
|
|
// EN.
|
|
if runStart == 0 {
|
|
leadType = s.sos
|
|
} else {
|
|
leadType = s.types[runStart-1]
|
|
if leadType.in(AN, EN) {
|
|
leadType = R
|
|
}
|
|
}
|
|
if runEnd == len(s.types) {
|
|
trailType = s.eos
|
|
} else {
|
|
trailType = s.types[runEnd]
|
|
if trailType.in(AN, EN) {
|
|
trailType = R
|
|
}
|
|
}
|
|
|
|
var resolvedType Class
|
|
if leadType == trailType {
|
|
// Rule N1.
|
|
resolvedType = leadType
|
|
} else {
|
|
// Rule N2.
|
|
// Notice the embedding level of the run is used, not
|
|
// the paragraph embedding level.
|
|
resolvedType = typeForLevel(s.level)
|
|
}
|
|
|
|
setTypes(s.types[runStart:runEnd], resolvedType)
|
|
|
|
// skip over run of (former) neutrals
|
|
i = runEnd
|
|
}
|
|
}
|
|
}
|
|
|
|
func setLevels(levels []level, newLevel level) {
|
|
for i := range levels {
|
|
levels[i] = newLevel
|
|
}
|
|
}
|
|
|
|
func setTypes(types []Class, newType Class) {
|
|
for i := range types {
|
|
types[i] = newType
|
|
}
|
|
}
|
|
|
|
// 7) resolving implicit embedding levels Rules I1, I2.
|
|
func (s *isolatingRunSequence) resolveImplicitLevels() {
|
|
|
|
// on entry, only these types can be in resultTypes
|
|
s.assertOnly(L, R, EN, AN)
|
|
|
|
s.resolvedLevels = make([]level, len(s.types))
|
|
setLevels(s.resolvedLevels, s.level)
|
|
|
|
if (s.level & 1) == 0 { // even level
|
|
for i, t := range s.types {
|
|
// Rule I1.
|
|
if t == L {
|
|
// no change
|
|
} else if t == R {
|
|
s.resolvedLevels[i] += 1
|
|
} else { // t == AN || t == EN
|
|
s.resolvedLevels[i] += 2
|
|
}
|
|
}
|
|
} else { // odd level
|
|
for i, t := range s.types {
|
|
// Rule I2.
|
|
if t == R {
|
|
// no change
|
|
} else { // t == L || t == AN || t == EN
|
|
s.resolvedLevels[i] += 1
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Applies the levels and types resolved in rules W1-I2 to the
|
|
// resultLevels array.
|
|
func (s *isolatingRunSequence) applyLevelsAndTypes() {
|
|
for i, x := range s.indexes {
|
|
s.p.resultTypes[x] = s.types[i]
|
|
s.p.resultLevels[x] = s.resolvedLevels[i]
|
|
}
|
|
}
|
|
|
|
// Return the limit of the run consisting only of the types in validSet
|
|
// starting at index. This checks the value at index, and will return
|
|
// index if that value is not in validSet.
|
|
func (s *isolatingRunSequence) findRunLimit(index int, validSet ...Class) int {
|
|
loop:
|
|
for ; index < len(s.types); index++ {
|
|
t := s.types[index]
|
|
for _, valid := range validSet {
|
|
if t == valid {
|
|
continue loop
|
|
}
|
|
}
|
|
return index // didn't find a match in validSet
|
|
}
|
|
return len(s.types)
|
|
}
|
|
|
|
// Algorithm validation. Assert that all values in types are in the
|
|
// provided set.
|
|
func (s *isolatingRunSequence) assertOnly(codes ...Class) {
|
|
loop:
|
|
for i, t := range s.types {
|
|
for _, c := range codes {
|
|
if t == c {
|
|
continue loop
|
|
}
|
|
}
|
|
log.Panicf("invalid bidi code %v present in assertOnly at position %d", t, s.indexes[i])
|
|
}
|
|
}
|
|
|
|
// determineLevelRuns returns an array of level runs. Each level run is
|
|
// described as an array of indexes into the input string.
|
|
//
|
|
// Determines the level runs. Rule X9 will be applied in determining the
|
|
// runs, in the way that makes sure the characters that are supposed to be
|
|
// removed are not included in the runs.
|
|
func (p *paragraph) determineLevelRuns() [][]int {
|
|
run := []int{}
|
|
allRuns := [][]int{}
|
|
currentLevel := implicitLevel
|
|
|
|
for i := range p.initialTypes {
|
|
if !isRemovedByX9(p.initialTypes[i]) {
|
|
if p.resultLevels[i] != currentLevel {
|
|
// we just encountered a new run; wrap up last run
|
|
if currentLevel >= 0 { // only wrap it up if there was a run
|
|
allRuns = append(allRuns, run)
|
|
run = nil
|
|
}
|
|
// Start new run
|
|
currentLevel = p.resultLevels[i]
|
|
}
|
|
run = append(run, i)
|
|
}
|
|
}
|
|
// Wrap up the final run, if any
|
|
if len(run) > 0 {
|
|
allRuns = append(allRuns, run)
|
|
}
|
|
return allRuns
|
|
}
|
|
|
|
// Definition BD13. Determine isolating run sequences.
|
|
func (p *paragraph) determineIsolatingRunSequences() []*isolatingRunSequence {
|
|
levelRuns := p.determineLevelRuns()
|
|
|
|
// Compute the run that each character belongs to
|
|
runForCharacter := make([]int, p.Len())
|
|
for i, run := range levelRuns {
|
|
for _, index := range run {
|
|
runForCharacter[index] = i
|
|
}
|
|
}
|
|
|
|
sequences := []*isolatingRunSequence{}
|
|
|
|
var currentRunSequence []int
|
|
|
|
for _, run := range levelRuns {
|
|
first := run[0]
|
|
if p.initialTypes[first] != PDI || p.matchingIsolateInitiator[first] == -1 {
|
|
currentRunSequence = nil
|
|
// int run = i;
|
|
for {
|
|
// Copy this level run into currentRunSequence
|
|
currentRunSequence = append(currentRunSequence, run...)
|
|
|
|
last := currentRunSequence[len(currentRunSequence)-1]
|
|
lastT := p.initialTypes[last]
|
|
if lastT.in(LRI, RLI, FSI) && p.matchingPDI[last] != p.Len() {
|
|
run = levelRuns[runForCharacter[p.matchingPDI[last]]]
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
sequences = append(sequences, p.isolatingRunSequence(currentRunSequence))
|
|
}
|
|
}
|
|
return sequences
|
|
}
|
|
|
|
// Assign level information to characters removed by rule X9. This is for
|
|
// ease of relating the level information to the original input data. Note
|
|
// that the levels assigned to these codes are arbitrary, they're chosen so
|
|
// as to avoid breaking level runs.
|
|
func (p *paragraph) assignLevelsToCharactersRemovedByX9() {
|
|
for i, t := range p.initialTypes {
|
|
if t.in(LRE, RLE, LRO, RLO, PDF, BN) {
|
|
p.resultTypes[i] = t
|
|
p.resultLevels[i] = -1
|
|
}
|
|
}
|
|
// now propagate forward the levels information (could have
|
|
// propagated backward, the main thing is not to introduce a level
|
|
// break where one doesn't already exist).
|
|
|
|
if p.resultLevels[0] == -1 {
|
|
p.resultLevels[0] = p.embeddingLevel
|
|
}
|
|
for i := 1; i < len(p.initialTypes); i++ {
|
|
if p.resultLevels[i] == -1 {
|
|
p.resultLevels[i] = p.resultLevels[i-1]
|
|
}
|
|
}
|
|
// Embedding information is for informational purposes only so need not be
|
|
// adjusted.
|
|
}
|
|
|
|
//
|
|
// Output
|
|
//
|
|
|
|
// getLevels computes levels array breaking lines at offsets in linebreaks.
|
|
// Rule L1.
|
|
//
|
|
// The linebreaks array must include at least one value. The values must be
|
|
// in strictly increasing order (no duplicates) between 1 and the length of
|
|
// the text, inclusive. The last value must be the length of the text.
|
|
func (p *paragraph) getLevels(linebreaks []int) []level {
|
|
// Note that since the previous processing has removed all
|
|
// P, S, and WS values from resultTypes, the values referred to
|
|
// in these rules are the initial types, before any processing
|
|
// has been applied (including processing of overrides).
|
|
//
|
|
// This example implementation has reinserted explicit format codes
|
|
// and BN, in order that the levels array correspond to the
|
|
// initial text. Their final placement is not normative.
|
|
// These codes are treated like WS in this implementation,
|
|
// so they don't interrupt sequences of WS.
|
|
|
|
validateLineBreaks(linebreaks, p.Len())
|
|
|
|
result := append([]level(nil), p.resultLevels...)
|
|
|
|
// don't worry about linebreaks since if there is a break within
|
|
// a series of WS values preceding S, the linebreak itself
|
|
// causes the reset.
|
|
for i, t := range p.initialTypes {
|
|
if t.in(B, S) {
|
|
// Rule L1, clauses one and two.
|
|
result[i] = p.embeddingLevel
|
|
|
|
// Rule L1, clause three.
|
|
for j := i - 1; j >= 0; j-- {
|
|
if isWhitespace(p.initialTypes[j]) { // including format codes
|
|
result[j] = p.embeddingLevel
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Rule L1, clause four.
|
|
start := 0
|
|
for _, limit := range linebreaks {
|
|
for j := limit - 1; j >= start; j-- {
|
|
if isWhitespace(p.initialTypes[j]) { // including format codes
|
|
result[j] = p.embeddingLevel
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
start = limit
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// getReordering returns the reordering of lines from a visual index to a
|
|
// logical index for line breaks at the given offsets.
|
|
//
|
|
// Lines are concatenated from left to right. So for example, the fifth
|
|
// character from the left on the third line is
|
|
//
|
|
// getReordering(linebreaks)[linebreaks[1] + 4]
|
|
//
|
|
// (linebreaks[1] is the position after the last character of the second
|
|
// line, which is also the index of the first character on the third line,
|
|
// and adding four gets the fifth character from the left).
|
|
//
|
|
// The linebreaks array must include at least one value. The values must be
|
|
// in strictly increasing order (no duplicates) between 1 and the length of
|
|
// the text, inclusive. The last value must be the length of the text.
|
|
func (p *paragraph) getReordering(linebreaks []int) []int {
|
|
validateLineBreaks(linebreaks, p.Len())
|
|
|
|
return computeMultilineReordering(p.getLevels(linebreaks), linebreaks)
|
|
}
|
|
|
|
// Return multiline reordering array for a given level array. Reordering
|
|
// does not occur across a line break.
|
|
func computeMultilineReordering(levels []level, linebreaks []int) []int {
|
|
result := make([]int, len(levels))
|
|
|
|
start := 0
|
|
for _, limit := range linebreaks {
|
|
tempLevels := make([]level, limit-start)
|
|
copy(tempLevels, levels[start:])
|
|
|
|
for j, order := range computeReordering(tempLevels) {
|
|
result[start+j] = order + start
|
|
}
|
|
start = limit
|
|
}
|
|
return result
|
|
}
|
|
|
|
// Return reordering array for a given level array. This reorders a single
|
|
// line. The reordering is a visual to logical map. For example, the
|
|
// leftmost char is string.charAt(order[0]). Rule L2.
|
|
func computeReordering(levels []level) []int {
|
|
result := make([]int, len(levels))
|
|
// initialize order
|
|
for i := range result {
|
|
result[i] = i
|
|
}
|
|
|
|
// locate highest level found on line.
|
|
// Note the rules say text, but no reordering across line bounds is
|
|
// performed, so this is sufficient.
|
|
highestLevel := level(0)
|
|
lowestOddLevel := level(maxDepth + 2)
|
|
for _, level := range levels {
|
|
if level > highestLevel {
|
|
highestLevel = level
|
|
}
|
|
if level&1 != 0 && level < lowestOddLevel {
|
|
lowestOddLevel = level
|
|
}
|
|
}
|
|
|
|
for level := highestLevel; level >= lowestOddLevel; level-- {
|
|
for i := 0; i < len(levels); i++ {
|
|
if levels[i] >= level {
|
|
// find range of text at or above this level
|
|
start := i
|
|
limit := i + 1
|
|
for limit < len(levels) && levels[limit] >= level {
|
|
limit++
|
|
}
|
|
|
|
for j, k := start, limit-1; j < k; j, k = j+1, k-1 {
|
|
result[j], result[k] = result[k], result[j]
|
|
}
|
|
// skip to end of level run
|
|
i = limit
|
|
}
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// isWhitespace reports whether the type is considered a whitespace type for the
|
|
// line break rules.
|
|
func isWhitespace(c Class) bool {
|
|
switch c {
|
|
case LRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS:
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// isRemovedByX9 reports whether the type is one of the types removed in X9.
|
|
func isRemovedByX9(c Class) bool {
|
|
switch c {
|
|
case LRE, RLE, LRO, RLO, PDF, BN:
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// typeForLevel reports the strong type (L or R) corresponding to the level.
|
|
func typeForLevel(level level) Class {
|
|
if (level & 0x1) == 0 {
|
|
return L
|
|
}
|
|
return R
|
|
}
|
|
|
|
func validateTypes(types []Class) error {
|
|
if len(types) == 0 {
|
|
return fmt.Errorf("types is null")
|
|
}
|
|
for i, t := range types[:len(types)-1] {
|
|
if t == B {
|
|
return fmt.Errorf("B type before end of paragraph at index: %d", i)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validateParagraphEmbeddingLevel(embeddingLevel level) error {
|
|
if embeddingLevel != implicitLevel &&
|
|
embeddingLevel != 0 &&
|
|
embeddingLevel != 1 {
|
|
return fmt.Errorf("illegal paragraph embedding level: %d", embeddingLevel)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validateLineBreaks(linebreaks []int, textLength int) error {
|
|
prev := 0
|
|
for i, next := range linebreaks {
|
|
if next <= prev {
|
|
return fmt.Errorf("bad linebreak: %d at index: %d", next, i)
|
|
}
|
|
prev = next
|
|
}
|
|
if prev != textLength {
|
|
return fmt.Errorf("last linebreak was %d, want %d", prev, textLength)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validatePbTypes(pairTypes []bracketType) error {
|
|
if len(pairTypes) == 0 {
|
|
return fmt.Errorf("pairTypes is null")
|
|
}
|
|
for i, pt := range pairTypes {
|
|
switch pt {
|
|
case bpNone, bpOpen, bpClose:
|
|
default:
|
|
return fmt.Errorf("illegal pairType value at %d: %v", i, pairTypes[i])
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validatePbValues(pairValues []rune, pairTypes []bracketType) error {
|
|
if pairValues == nil {
|
|
return fmt.Errorf("pairValues is null")
|
|
}
|
|
if len(pairTypes) != len(pairValues) {
|
|
return fmt.Errorf("pairTypes is different length from pairValues")
|
|
}
|
|
return nil
|
|
}
|