// Copyright 2015 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package bidiimport ()// This implementation is a port based on the reference implementation found at:// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava///// described in Unicode Bidirectional Algorithm (UAX #9).//// Input:// There are two levels of input to the algorithm, since clients may prefer to// supply some information from out-of-band sources rather than relying on the// default behavior.//// - Bidi class array// - Bidi class array, with externally supplied base line direction//// Output:// Output is separated into several stages://// - levels array over entire paragraph// - reordering array over entire paragraph// - levels array over line// - reordering array over line//// Note that for conformance to the Unicode Bidirectional Algorithm,// implementations are only required to generate correct reordering and// character directionality (odd or even levels) over a line. Generating// identical level arrays over a line is not required. Bidi explicit format// codes (LRE, RLE, LRO, RLO, PDF) and BN can be assigned arbitrary levels and// positions as long as the rest of the input is properly reordered.//// As the algorithm is defined to operate on a single paragraph at a time, this// implementation is written to handle single paragraphs. Thus rule P1 is// presumed by this implementation-- the data provided to the implementation is// assumed to be a single paragraph, and either contains no 'B' codes, or a// single 'B' code at the end of the input. 'B' is allowed as input to// illustrate how the algorithm assigns it a level.//// Also note that rules L3 and L4 depend on the rendering engine that uses the// result of the bidi algorithm. This implementation assumes that the rendering// engine expects combining marks in visual order (e.g. to the left of their// base character in RTL runs) and that it adjusts the glyphs used to render// mirrored characters that are in RTL runs so that they render appropriately.// level is the embedding level of a character. Even embedding levels indicate// left-to-right order and odd levels indicate right-to-left order. The special// level of -1 is reserved for undefined order.type level int8const implicitLevel level = -1// in returns if x is equal to any of the values in set.func ( Class) ( ...Class) bool {for , := range {if == {returntrue } }returnfalse}// A paragraph contains the state of a paragraph.type paragraph struct { initialTypes []Class// Arrays of properties needed for paired bracket evaluation in N0 pairTypes []bracketType// paired Bracket types for paragraph pairValues []rune// rune for opening bracket or pbOpen and pbClose; 0 for pbNone embeddingLevel level// default: = implicitLevel;// at the paragraph levels resultTypes []Class resultLevels []level// Index of matching PDI for isolate initiator characters. For other // characters, the value of matchingPDI will be set to -1. For isolate // initiators with no matching PDI, matchingPDI will be set to the length of // the input string. matchingPDI []int// Index of matching isolate initiator for PDI characters. For other // characters, and for PDIs with no matching isolate initiator, the value of // matchingIsolateInitiator will be set to -1. matchingIsolateInitiator []int}// newParagraph initializes a paragraph. The user needs to supply a few arrays// corresponding to the preprocessed text input. The types correspond to the// Unicode BiDi classes for each rune. pairTypes indicates the bracket type for// each rune. pairValues provides a unique bracket class identifier for each// rune (suggested is the rune of the open bracket for opening and matching// close brackets, after normalization). The embedding levels are optional, but// may be supplied to encode embedding levels of styled text.func newParagraph( []Class, []bracketType, []rune, level) (*paragraph, error) {varerrorif = validateTypes(); != nil {returnnil, }if = validatePbTypes(); != nil {returnnil, }if = validatePbValues(, ); != nil {returnnil, }if = validateParagraphEmbeddingLevel(); != nil {returnnil, } := ¶graph{initialTypes: append([]Class(nil), ...),embeddingLevel: ,pairTypes: ,pairValues: ,resultTypes: append([]Class(nil), ...), } .run()return , nil}func ( *paragraph) () int { returnlen(.initialTypes) }// The algorithm. Does not include line-based processing (Rules L1, L2).// These are applied later in the line-based phase of the algorithm.func ( *paragraph) () { .determineMatchingIsolates()// 1) determining the paragraph level // Rule P1 is the requirement for entering this algorithm. // Rules P2, P3. // If no externally supplied paragraph embedding level, use default.if .embeddingLevel == implicitLevel { .embeddingLevel = .determineParagraphEmbeddingLevel(0, .Len()) }// Initialize result levels to paragraph embedding level. .resultLevels = make([]level, .Len())setLevels(.resultLevels, .embeddingLevel)// 2) Explicit levels and directions // Rules X1-X8. .determineExplicitEmbeddingLevels()// Rule X9. // We do not remove the embeddings, the overrides, the PDFs, and the BNs // from the string explicitly. But they are not copied into isolating run // sequences when they are created, so they are removed for all // practical purposes.// Rule X10. // Run remainder of algorithm one isolating run sequence at a timefor , := range .determineIsolatingRunSequences() {// 3) resolving weak types // Rules W1-W7. .resolveWeakTypes()// 4a) resolving paired brackets // Rule N0resolvePairedBrackets()// 4b) resolving neutral types // Rules N1-N3. .resolveNeutralTypes()// 5) resolving implicit embedding levels // Rules I1, I2. .resolveImplicitLevels()// Apply the computed levels and types .applyLevelsAndTypes() }// Assign appropriate levels to 'hide' LREs, RLEs, LROs, RLOs, PDFs, and // BNs. This is for convenience, so the resulting level array will have // a value for every character. .assignLevelsToCharactersRemovedByX9()}// determineMatchingIsolates determines the matching PDI for each isolate// initiator and vice versa.//// Definition BD9.//// At the end of this function://// - The member variable matchingPDI is set to point to the index of the// matching PDI character for each isolate initiator character. If there is// no matching PDI, it is set to the length of the input text. For other// characters, it is set to -1.// - The member variable matchingIsolateInitiator is set to point to the// index of the matching isolate initiator character for each PDI character.// If there is no matching isolate initiator, or the character is not a PDI,// it is set to -1.func ( *paragraph) () { .matchingPDI = make([]int, .Len()) .matchingIsolateInitiator = make([]int, .Len())for := range .matchingIsolateInitiator { .matchingIsolateInitiator[] = -1 }for := range .matchingPDI { .matchingPDI[] = -1if := .resultTypes[]; .in(LRI, RLI, FSI) { := 1for := + 1; < .Len(); ++ {if := .resultTypes[]; .in(LRI, RLI, FSI) { ++ } elseif == PDI {if --; == 0 { .matchingPDI[] = .matchingIsolateInitiator[] = break } } }if .matchingPDI[] == -1 { .matchingPDI[] = .Len() } } }}// determineParagraphEmbeddingLevel reports the resolved paragraph direction of// the substring limited by the given range [start, end).//// Determines the paragraph level based on rules P2, P3. This is also used// in rule X5c to find if an FSI should resolve to LRI or RLI.func ( *paragraph) (, int) level {varClass = unknownClass// Rule P2.for := ; < ; ++ {if := .resultTypes[]; .in(L, AL, R) { = break } elseif .in(FSI, LRI, RLI) { = .matchingPDI[] // skip over to the matching PDIif > {log.Panic("assert (i <= end)") } } }// Rule P3.switch {caseunknownClass: // none found// default embedding level when no strong types found is 0.return0caseL:return0default: // AL, Rreturn1 }}const maxDepth = 125// This stack will store the embedding levels and override and isolated// statusestype directionalStatusStack struct { stackCounter int embeddingLevelStack [maxDepth + 1]level overrideStatusStack [maxDepth + 1]Class isolateStatusStack [maxDepth + 1]bool}func ( *directionalStatusStack) () { .stackCounter = 0 }func ( *directionalStatusStack) () { .stackCounter-- }func ( *directionalStatusStack) () int { return .stackCounter }func ( *directionalStatusStack) ( level, Class, bool) { .embeddingLevelStack[.stackCounter] = .overrideStatusStack[.stackCounter] = .isolateStatusStack[.stackCounter] = .stackCounter++}func ( *directionalStatusStack) () level {return .embeddingLevelStack[.stackCounter-1]}func ( *directionalStatusStack) () Class {return .overrideStatusStack[.stackCounter-1]}func ( *directionalStatusStack) () bool {return .isolateStatusStack[.stackCounter-1]}// Determine explicit levels using rules X1 - X8func ( *paragraph) () {vardirectionalStatusStackvar , , int// Rule X1. .push(.embeddingLevel, ON, false)for , := range .resultTypes {// Rules X2, X3, X4, X5, X5a, X5b, X5cswitch {caseRLE, LRE, RLO, LRO, RLI, LRI, FSI: := .in(RLI, LRI, FSI) := .in(RLE, RLO, RLI)// override if this is an FSI that resolves to RLIif == FSI { = (.determineParagraphEmbeddingLevel(+1, .matchingPDI[]) == 1) }if { .resultLevels[] = .lastEmbeddingLevel()if .lastDirectionalOverrideStatus() != ON { .resultTypes[] = .lastDirectionalOverrideStatus() } }varlevelif {// least greater odd = (.lastEmbeddingLevel() + 1) | 1 } else {// least greater even = (.lastEmbeddingLevel() + 2) &^ 1 }if <= maxDepth && == 0 && == 0 {if { ++ }// Push new embedding level, override status, and isolated // status. // No check for valid stack counter, since the level check // suffices.switch {caseLRO: .push(, L, )caseRLO: .push(, R, )default: .push(, ON, ) }// Not really part of the specif ! { .resultLevels[] = } } else {// This is an invalid explicit formatting character, // so apply the "Otherwise" part of rules X2-X5b.if { ++ } else { // !isIsolateif == 0 { ++ } } }// Rule X6acasePDI:if > 0 { -- } elseif == 0 {// do nothing } else { = 0for !.lastDirectionalIsolateStatus() { .pop() } .pop() -- } .resultLevels[] = .lastEmbeddingLevel()// Rule X7casePDF:// Not really part of the spec .resultLevels[] = .lastEmbeddingLevel()if > 0 {// do nothing } elseif > 0 { -- } elseif !.lastDirectionalIsolateStatus() && .depth() >= 2 { .pop() }caseB: // paragraph separator.// Rule X8.// These values are reset for clarity, in this implementation B // can only occur as the last code in the array. .empty() = 0 = 0 = 0 .resultLevels[] = .embeddingLeveldefault: .resultLevels[] = .lastEmbeddingLevel()if .lastDirectionalOverrideStatus() != ON { .resultTypes[] = .lastDirectionalOverrideStatus() } } }}type isolatingRunSequence struct { p *paragraph indexes []int// indexes to the original string types []Class// type of each character using the index resolvedLevels []level// resolved levels after application of rules level level sos, eos Class}func ( *isolatingRunSequence) () int { returnlen(.indexes) }func maxLevel(, level) level {if > {return }return}// Rule X10, second bullet: Determine the start-of-sequence (sos) and end-of-sequence (eos) types,// either L or R, for each isolating run sequence.func ( *paragraph) ( []int) *isolatingRunSequence { := len() := make([]Class, )for , := range { [] = .resultTypes[] }// assign level, sos and eos := [0] - 1for >= 0 && isRemovedByX9(.initialTypes[]) { -- } := .embeddingLevelif >= 0 { = .resultLevels[] }varlevel := [-1]if .in(LRI, RLI, FSI) { = .embeddingLevel } else {// the first character after the end of run sequence := [-1] + 1for ; < .Len() && isRemovedByX9(.initialTypes[]); ++ { } = .embeddingLevelif < .Len() { = .resultLevels[] } } := .resultLevels[[0]]return &isolatingRunSequence{p: ,indexes: ,types: ,level: ,sos: typeForLevel(maxLevel(, )),eos: typeForLevel(maxLevel(, )), }}// Resolving weak types Rules W1-W7.//// Note that some weak types (EN, AN) remain after this processing is// complete.func ( *isolatingRunSequence) () {// on entry, only these types remain .assertOnly(L, R, AL, EN, ES, ET, AN, CS, B, S, WS, ON, NSM, LRI, RLI, FSI, PDI)// Rule W1. // Changes all NSMs. := .sosfor , := range .types {if == NSM { .types[] = } else {// if t.in(LRI, RLI, FSI, PDI) { // precedingCharacterType = ON // } = } }// Rule W2. // EN does not change at the start of the run, because sos != AL.for , := range .types {if == EN {for := - 1; >= 0; -- {if := .types[]; .in(L, R, AL) {if == AL { .types[] = AN }break } } } }// Rule W3.for , := range .types {if == AL { .types[] = R } }// Rule W4. // Since there must be values on both sides for this rule to have an // effect, the scan skips the first and last value. // // Although the scan proceeds left to right, and changes the type // values in a way that would appear to affect the computations // later in the scan, there is actually no problem. A change in the // current value can only affect the value to its immediate right, // and only affect it if it is ES or CS. But the current value can // only change if the value to its right is not ES or CS. Thus // either the current value will not change, or its change will have // no effect on the remainder of the analysis.for := 1; < .Len()-1; ++ { := .types[]if == ES || == CS { := .types[-1] := .types[+1]if == EN && == EN { .types[] = EN } elseif .types[] == CS && == AN && == AN { .types[] = AN } } }// Rule W5.for , := range .types {if == ET {// locate end of sequence := := .findRunLimit(, ET)// check values at ends of sequence := .sosif > 0 { = .types[-1] }if != EN { = .eosif < len(.types) { = .types[] } }if == EN {setTypes(.types[:], EN) }// continue at end of sequence = } }// Rule W6.for , := range .types {if .in(ES, ET, CS) { .types[] = ON } }// Rule W7.for , := range .types {if == EN {// set default if we reach start of run := .sosfor := - 1; >= 0; -- { = .types[]if == L || == R { // AL's have been changed to R = break } }if == L { .types[] = L } } }}// 6) resolving neutral types Rules N1-N2.func ( *isolatingRunSequence) () {// on entry, only these types can be in resultTypes .assertOnly(L, R, EN, AN, B, S, WS, ON, RLI, LRI, FSI, PDI)for , := range .types {switch {caseWS, ON, B, S, RLI, LRI, FSI, PDI:// find bounds of run of neutrals := := .findRunLimit(, B, S, WS, ON, RLI, LRI, FSI, PDI)// determine effective types at ends of runvar , Class// Note that the character found can only be L, R, AN, or // EN.if == 0 { = .sos } else { = .types[-1]if .in(AN, EN) { = R } }if == len(.types) { = .eos } else { = .types[]if .in(AN, EN) { = R } }varClassif == {// Rule N1. = } else {// Rule N2. // Notice the embedding level of the run is used, not // the paragraph embedding level. = typeForLevel(.level) }setTypes(.types[:], )// skip over run of (former) neutrals = } }}func setLevels( []level, level) {for := range { [] = }}func setTypes( []Class, Class) {for := range { [] = }}// 7) resolving implicit embedding levels Rules I1, I2.func ( *isolatingRunSequence) () {// on entry, only these types can be in resultTypes .assertOnly(L, R, EN, AN) .resolvedLevels = make([]level, len(.types))setLevels(.resolvedLevels, .level)if (.level & 1) == 0 { // even levelfor , := range .types {// Rule I1.if == L {// no change } elseif == R { .resolvedLevels[] += 1 } else { // t == AN || t == EN .resolvedLevels[] += 2 } } } else { // odd levelfor , := range .types {// Rule I2.if == R {// no change } else { // t == L || t == AN || t == EN .resolvedLevels[] += 1 } } }}// Applies the levels and types resolved in rules W1-I2 to the// resultLevels array.func ( *isolatingRunSequence) () {for , := range .indexes { .p.resultTypes[] = .types[] .p.resultLevels[] = .resolvedLevels[] }}// Return the limit of the run consisting only of the types in validSet// starting at index. This checks the value at index, and will return// index if that value is not in validSet.func ( *isolatingRunSequence) ( int, ...Class) int {:for ; < len(.types); ++ { := .types[]for , := range {if == {continue } }return// didn't find a match in validSet }returnlen(.types)}// Algorithm validation. Assert that all values in types are in the// provided set.func ( *isolatingRunSequence) ( ...Class) {:for , := range .types {for , := range {if == {continue } }log.Panicf("invalid bidi code %v present in assertOnly at position %d", , .indexes[]) }}// determineLevelRuns returns an array of level runs. Each level run is// described as an array of indexes into the input string.//// Determines the level runs. Rule X9 will be applied in determining the// runs, in the way that makes sure the characters that are supposed to be// removed are not included in the runs.func ( *paragraph) () [][]int { := []int{} := [][]int{} := implicitLevelfor := range .initialTypes {if !isRemovedByX9(.initialTypes[]) {if .resultLevels[] != {// we just encountered a new run; wrap up last runif >= 0 { // only wrap it up if there was a run = append(, ) = nil }// Start new run = .resultLevels[] } = append(, ) } }// Wrap up the final run, if anyiflen() > 0 { = append(, ) }return}// Definition BD13. Determine isolating run sequences.func ( *paragraph) () []*isolatingRunSequence { := .determineLevelRuns()// Compute the run that each character belongs to := make([]int, .Len())for , := range {for , := range { [] = } } := []*isolatingRunSequence{}var []intfor , := range { := [0]if .initialTypes[] != PDI || .matchingIsolateInitiator[] == -1 { = nil// int run = i;for {// Copy this level run into currentRunSequence = append(, ...) := [len()-1] := .initialTypes[]if .in(LRI, RLI, FSI) && .matchingPDI[] != .Len() { = [[.matchingPDI[]]] } else {break } } = append(, .isolatingRunSequence()) } }return}// Assign level information to characters removed by rule X9. This is for// ease of relating the level information to the original input data. Note// that the levels assigned to these codes are arbitrary, they're chosen so// as to avoid breaking level runs.func ( *paragraph) () {for , := range .initialTypes {if .in(LRE, RLE, LRO, RLO, PDF, BN) { .resultTypes[] = .resultLevels[] = -1 } }// now propagate forward the levels information (could have // propagated backward, the main thing is not to introduce a level // break where one doesn't already exist).if .resultLevels[0] == -1 { .resultLevels[0] = .embeddingLevel }for := 1; < len(.initialTypes); ++ {if .resultLevels[] == -1 { .resultLevels[] = .resultLevels[-1] } }// Embedding information is for informational purposes only so need not be // adjusted.}//// Output//// getLevels computes levels array breaking lines at offsets in linebreaks.// Rule L1.//// The linebreaks array must include at least one value. The values must be// in strictly increasing order (no duplicates) between 1 and the length of// the text, inclusive. The last value must be the length of the text.func ( *paragraph) ( []int) []level {// Note that since the previous processing has removed all // P, S, and WS values from resultTypes, the values referred to // in these rules are the initial types, before any processing // has been applied (including processing of overrides). // // This example implementation has reinserted explicit format codes // and BN, in order that the levels array correspond to the // initial text. Their final placement is not normative. // These codes are treated like WS in this implementation, // so they don't interrupt sequences of WS.validateLineBreaks(, .Len()) := append([]level(nil), .resultLevels...)// don't worry about linebreaks since if there is a break within // a series of WS values preceding S, the linebreak itself // causes the reset.for , := range .initialTypes {if .in(B, S) {// Rule L1, clauses one and two. [] = .embeddingLevel// Rule L1, clause three.for := - 1; >= 0; -- {ifisWhitespace(.initialTypes[]) { // including format codes [] = .embeddingLevel } else {break } } } }// Rule L1, clause four. := 0for , := range {for := - 1; >= ; -- {ifisWhitespace(.initialTypes[]) { // including format codes [] = .embeddingLevel } else {break } } = }return}// getReordering returns the reordering of lines from a visual index to a// logical index for line breaks at the given offsets.//// Lines are concatenated from left to right. So for example, the fifth// character from the left on the third line is//// getReordering(linebreaks)[linebreaks[1] + 4]//// (linebreaks[1] is the position after the last character of the second// line, which is also the index of the first character on the third line,// and adding four gets the fifth character from the left).//// The linebreaks array must include at least one value. The values must be// in strictly increasing order (no duplicates) between 1 and the length of// the text, inclusive. The last value must be the length of the text.func ( *paragraph) ( []int) []int {validateLineBreaks(, .Len())returncomputeMultilineReordering(.getLevels(), )}// Return multiline reordering array for a given level array. Reordering// does not occur across a line break.func computeMultilineReordering( []level, []int) []int { := make([]int, len()) := 0for , := range { := make([]level, -)copy(, [:])for , := rangecomputeReordering() { [+] = + } = }return}// Return reordering array for a given level array. This reorders a single// line. The reordering is a visual to logical map. For example, the// leftmost char is string.charAt(order[0]). Rule L2.func computeReordering( []level) []int { := make([]int, len())// initialize orderfor := range { [] = }// locate highest level found on line. // Note the rules say text, but no reordering across line bounds is // performed, so this is sufficient. := level(0) := level(maxDepth + 2)for , := range {if > { = }if &1 != 0 && < { = } }for := ; >= ; -- {for := 0; < len(); ++ {if [] >= {// find range of text at or above this level := := + 1for < len() && [] >= { ++ }for , := , -1; < ; , = +1, -1 { [], [] = [], [] }// skip to end of level run = } } }return}// isWhitespace reports whether the type is considered a whitespace type for the// line break rules.func isWhitespace( Class) bool {switch {caseLRE, RLE, LRO, RLO, PDF, LRI, RLI, FSI, PDI, BN, WS:returntrue }returnfalse}// isRemovedByX9 reports whether the type is one of the types removed in X9.func isRemovedByX9( Class) bool {switch {caseLRE, RLE, LRO, RLO, PDF, BN:returntrue }returnfalse}// typeForLevel reports the strong type (L or R) corresponding to the level.func typeForLevel( level) Class {if ( & 0x1) == 0 {returnL }returnR}func validateTypes( []Class) error {iflen() == 0 {returnfmt.Errorf("types is null") }for , := range [:len()-1] {if == B {returnfmt.Errorf("B type before end of paragraph at index: %d", ) } }returnnil}func validateParagraphEmbeddingLevel( level) error {if != implicitLevel && != 0 && != 1 {returnfmt.Errorf("illegal paragraph embedding level: %d", ) }returnnil}func validateLineBreaks( []int, int) error { := 0for , := range {if <= {returnfmt.Errorf("bad linebreak: %d at index: %d", , ) } = }if != {returnfmt.Errorf("last linebreak was %d, want %d", , ) }returnnil}func validatePbTypes( []bracketType) error {iflen() == 0 {returnfmt.Errorf("pairTypes is null") }for , := range {switch {casebpNone, bpOpen, bpClose:default:returnfmt.Errorf("illegal pairType value at %d: %v", , []) } }returnnil}func validatePbValues( []rune, []bracketType) error {if == nil {returnfmt.Errorf("pairValues is null") }iflen() != len() {returnfmt.Errorf("pairTypes is different length from pairValues") }returnnil}
The pages are generated with Goldsv0.7.0-preview. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.