// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Note: the file data_test.go that is generated should not be checked in.
//go:generate go run maketables.go triegen.go
//go:generate go test -tags test

// Package norm contains types and functions for normalizing Unicode strings.
package norm // import "golang.org/x/text/unicode/norm"

import (
	"unicode/utf8"

	"golang.org/x/text/transform"
)

// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
//	NFC   Unicode Normalization Form C
//	NFD   Unicode Normalization Form D
//	NFKC  Unicode Normalization Form KC
//	NFKD  Unicode Normalization Form KD
//
// For a Form f, this documentation uses the notation f(x) to mean
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
//
//	f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: https://unicode.org/reports/tr15/ and
// https://unicode.org/notes/tn5/.
type Form int

const (
	NFC Form = iota
	NFD
	NFKC
	NFKD
)

// Bytes returns f(b). May return b if f(b) = b.
func (f Form) Bytes(b []byte) []byte {
	src := inputBytes(b)
	ft := formTable[f]
	n, ok := ft.quickSpan(src, 0, len(b), true)
	if ok {
		// The whole input passed the quick check; no copy is needed.
		return b
	}
	// Keep the already-normalized prefix and normalize the remainder.
	out := make([]byte, n, len(b))
	copy(out, b[0:n])
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
	return doAppendInner(&rb, n)
}

// String returns f(s).
func (f Form) String(s string) string {
	src := inputString(s)
	ft := formTable[f]
	n, ok := ft.quickSpan(src, 0, len(s), true)
	if ok {
		// The whole input passed the quick check; return it unchanged.
		return s
	}
	// Keep the already-normalized prefix and normalize the remainder.
	out := make([]byte, n, len(s))
	copy(out, s[0:n])
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
	return string(doAppendInner(&rb, n))
}

// IsNormal returns true if b == f(b).
func (f Form) IsNormal(b []byte) bool {
	src := inputBytes(b)
	ft := formTable[f]
	bp, ok := ft.quickSpan(src, 0, len(b), true)
	if ok {
		return true
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
	// Instead of producing output, each flush compares the buffered runes
	// against the input that starts at the current position.
	rb.setFlusher(nil, cmpNormalBytes)
	for bp < len(b) {
		rb.out = b[bp:]
		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
			return false
		}
		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
	}
	return true
}

// cmpNormalBytes reports whether the runes currently buffered in rb are
// byte-for-byte equal to the leading bytes of rb.out. IsNormal installs it
// as the flush function of its reorderBuffer.
func cmpNormalBytes(rb *reorderBuffer) bool {
	b := rb.out
	for i := 0; i < rb.nrune; i++ {
		info := rb.rune[i]
		if int(info.size) > len(b) {
			return false
		}
		p := info.pos
		pe := p + info.size
		for ; p < pe; p++ {
			if b[0] != rb.byte[p] {
				return false
			}
			b = b[1:]
		}
	}
	return true
}

// IsNormalString returns true if s == f(s).
func (f Form) IsNormalString(s string) bool {
	src := inputString(s)
	ft := formTable[f]
	bp, ok := ft.quickSpan(src, 0, len(s), true)
	if ok {
		return true
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
	// The flush function compares the buffered runes against s starting at
	// bp and advances bp (captured from the enclosing scope) as it goes.
	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
		for i := 0; i < rb.nrune; i++ {
			info := rb.rune[i]
			if bp+int(info.size) > len(s) {
				return false
			}
			p := info.pos
			pe := p + info.size
			for ; p < pe; p++ {
				if s[bp] != rb.byte[p] {
					return false
				}
				bp++
			}
		}
		return true
	})
	for bp < len(s) {
		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
			return false
		}
		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
	}
	return true
}

// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It patches rb.out in
// place and reports whether the decomposition is still in progress.
func patchTail(rb *reorderBuffer) bool {
	info, p := lastRuneStart(&rb.f, rb.out)
	if p == -1 || info.size == 0 {
		return true
	}
	end := p + int(info.size)
	extra := len(rb.out) - end
	if extra > 0 {
		// Potentially allocating memory. However, this only
		// happens with ill-formed UTF-8.
		x := make([]byte, 0)
		x = append(x, rb.out[len(rb.out)-extra:]...)
		rb.out = rb.out[:end]
		decomposeToLastBoundary(rb)
		rb.doFlush()
		rb.out = append(rb.out, x...)
		return false
	}
	buf := rb.out[p:]
	rb.out = rb.out[:p]
	decomposeToLastBoundary(rb)
	if s := rb.ss.next(info); s == ssStarter {
		rb.doFlush()
		rb.ss.first(info)
	} else if s == ssOverflow {
		rb.doFlush()
		rb.insertCGJ()
		rb.ss = 0
	}
	rb.insertUnsafe(inputBytes(buf), 0, info)
	return true
}

// appendQuick appends to rb.out the part of rb.src starting at i that
// quickSpan accepts, and returns the position at which it stopped.
func appendQuick(rb *reorderBuffer, i int) int {
	if rb.nsrc == i {
		return i
	}
	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
	rb.out = rb.src.appendSlice(rb.out, i, end)
	return end
}

// Append returns f(append(out, b...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
	return f.doAppend(out, inputBytes(src), len(src))
}

// doAppend is the shared implementation of Append and AppendString. It
// appends the normalized form of the first n bytes of src to out.
func (f Form) doAppend(out []byte, src input, n int) []byte {
	if n == 0 {
		return out
	}
	ft := formTable[f]
	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
	if len(out) == 0 {
		p, _ := ft.quickSpan(src, 0, n, true)
		out = src.appendSlice(out, 0, p)
		if p == n {
			return out
		}
		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
		return doAppendInner(&rb, p)
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
	return doAppend(&rb, out, 0)
}

// doAppend appends the normalized form of rb.src[p:rb.nsrc] to out. When out
// is non-empty, the start of the input may first have to be merged with the
// last segment already present in out.
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
	rb.setFlusher(out, appendFlush)
	src, n := rb.src, rb.nsrc
	doMerge := len(out) > 0
	if q := src.skipContinuationBytes(p); q > p {
		// Move leading non-starters to destination.
		rb.out = src.appendSlice(rb.out, p, q)
		p = q
		doMerge = patchTail(rb)
	}
	fd := &rb.f
	if doMerge {
		var info Properties
		if p < n {
			info = fd.info(src, p)
			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
				if p == 0 {
					decomposeToLastBoundary(rb)
				}
				p = decomposeSegment(rb, p, true)
			}
		}
		if info.size == 0 {
			rb.doFlush()
			// Append incomplete UTF-8 encoding.
			return src.appendSlice(rb.out, p, n)
		}
		if rb.nrune > 0 {
			return doAppendInner(rb, p)
		}
	}
	p = appendQuick(rb, p)
	return doAppendInner(rb, p)
}

// doAppendInner alternates between decomposing one segment and copying the
// following quick span until position p reaches rb.nsrc, then returns rb.out.
func doAppendInner(rb *reorderBuffer, p int) []byte {
	for n := rb.nsrc; p < n; {
		p = decomposeSegment(rb, p, true)
		p = appendQuick(rb, p)
	}
	return rb.out
}

// AppendString returns f(append(out, []byte(s))).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, src string) []byte {
	return f.doAppend(out, inputString(src), len(src))
}

// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
	return n
}

// Span implements transform.SpanningTransformer. It returns a boundary n such
// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
	if n < len(b) {
		if !ok {
			err = transform.ErrEndOfSpan
		} else {
			err = transform.ErrShortSrc
		}
	}
	return n, err
}

// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
	if n < len(s) {
		if !ok {
			err = transform.ErrEndOfSpan
		} else {
			err = transform.ErrShortSrc
		}
	}
	return n, err
}

// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
// whether any non-normalized parts were found. If atEOF is false, n will
// not point past the last segment if this segment might become
// non-normalized by appending other runes.
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
	var lastCC uint8
	ss := streamSafe(0)
	lastSegStart := i
	for n = end; i < n; {
		if j := src.skipASCII(i, n); i != j {
			i = j
			lastSegStart = i - 1
			lastCC = 0
			ss = 0
			continue
		}
		info := f.info(src, i)
		if info.size == 0 {
			if atEOF {
				// include incomplete runes
				return n, true
			}
			return lastSegStart, true
		}
		// This block needs to be before the next, because it is possible to
		// have an overflow for runes that are starters (e.g. with U+FF9E).
		switch ss.next(info) {
		case ssStarter:
			lastSegStart = i
		case ssOverflow:
			return lastSegStart, false
		case ssSuccess:
			if lastCC > info.ccc {
				return lastSegStart, false
			}
		}
		// The breaks below leave the for loop: the rune at i fails the
		// quick check for this form.
		if f.composing {
			if !info.isYesC() {
				break
			}
		} else {
			if !info.isYesD() {
				break
			}
		}
		lastCC = info.ccc
		i += int(info.size)
	}
	if i == n {
		if !atEOF {
			n = lastSegStart
		}
		return n, true
	}
	return lastSegStart, false
}

// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int {
	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
	return n
}

// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
	return f.firstBoundary(inputBytes(b), len(b))
}

// firstBoundary is the shared implementation of FirstBoundary and
// FirstBoundaryInString; nsrc is the length of src in bytes.
func (f Form) firstBoundary(src input, nsrc int) int {
	i := src.skipContinuationBytes(0)
	if i >= nsrc {
		return -1
	}
	fd := formTable[f]
	ss := streamSafe(0)
	// We should call ss.first here, but we can't as the first rune is
	// skipped already. This means FirstBoundary can't really determine
	// CGJ insertion points correctly. Luckily it doesn't have to.
	for {
		info := fd.info(src, i)
		if info.size == 0 {
			return -1
		}
		if s := ss.next(info); s != ssSuccess {
			return i
		}
		i += int(info.size)
		if i >= nsrc {
			if !info.BoundaryAfter() && !ss.isMax() {
				return -1
			}
			return nsrc
		}
	}
}

// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
	return f.firstBoundary(inputString(s), len(s))
}

// NextBoundary reports the index of the boundary between the first and next
// segment in b or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundary(b []byte, atEOF bool) int {
	return f.nextBoundary(inputBytes(b), len(b), atEOF)
}

// NextBoundaryInString reports the index of the boundary between the first and
// next segment in s or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundaryInString(s string, atEOF bool) int {
	return f.nextBoundary(inputString(s), len(s), atEOF)
}

// nextBoundary is the shared implementation of NextBoundary and
// NextBoundaryInString; nsrc is the length of src in bytes.
func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
	if nsrc == 0 {
		if atEOF {
			return 0
		}
		return -1
	}
	fd := formTable[f]
	info := fd.info(src, 0)
	if info.size == 0 {
		if atEOF {
			return 1
		}
		return -1
	}
	ss := streamSafe(0)
	ss.first(info)

	for i := int(info.size); i < nsrc; i += int(info.size) {
		info = fd.info(src, i)
		if info.size == 0 {
			if atEOF {
				return i
			}
			return -1
		}
		// TODO: Using streamSafe to determine the boundary isn't the same as
		// using BoundaryBefore. Determine which should be used.
		if s := ss.next(info); s != ssSuccess {
			return i
		}
	}
	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
		return -1
	}
	return nsrc
}

// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
	return lastBoundary(formTable[f], b)
}

// lastBoundary scans b backwards, rune by rune, and returns the position of
// the last boundary for the form described by fd, or -1 if none is found.
func lastBoundary(fd *formInfo, b []byte) int {
	i := len(b)
	info, p := lastRuneStart(fd, b)
	if p == -1 {
		return -1
	}
	if info.size == 0 { // ends with incomplete rune
		if p == 0 { // starts with incomplete rune
			return -1
		}
		i = p
		info, p = lastRuneStart(fd, b[:i])
		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
			return i
		}
	}
	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
		return i
	}
	if info.BoundaryAfter() {
		return i
	}
	ss := streamSafe(0)
	v := ss.backwards(info)
	for i = p; i >= 0 && v != ssStarter; i = p {
		info, p = lastRuneStart(fd, b[:i])
		if v = ss.backwards(info); v == ssOverflow {
			break
		}
		if p+int(info.size) != i {
			if p == -1 { // no boundary found
				return -1
			}
			return i // boundary after an illegal UTF-8 encoding
		}
	}
	return i
}

// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
// and returns the position in rb.src just past the scanned segment, or
// iShortDst or iShortSrc converted to int. It returns 0 if the rune at sp
// has size 0.
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
	// Force one character to be consumed.
	info := rb.f.info(rb.src, sp)
	if info.size == 0 {
		return 0
	}
	if s := rb.ss.next(info); s == ssStarter {
		// TODO: this could be removed if we don't support merging.
		if rb.nrune > 0 {
			goto end
		}
	} else if s == ssOverflow {
		rb.insertCGJ()
		goto end
	}
	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
		return int(err)
	}
	for {
		sp += int(info.size)
		if sp >= rb.nsrc {
			if !atEOF && !info.BoundaryAfter() {
				return int(iShortSrc)
			}
			break
		}
		info = rb.f.info(rb.src, sp)
		if info.size == 0 {
			if !atEOF {
				return int(iShortSrc)
			}
			break
		}
		if s := rb.ss.next(info); s == ssStarter {
			break
		} else if s == ssOverflow {
			rb.insertCGJ()
			break
		}
		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
			return int(err)
		}
	}
end:
	if !rb.doFlush() {
		return int(iShortDst)
	}
	return sp
}

// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
	p := len(buf) - 1
	// Walk backwards over continuation bytes to the start of the last rune.
	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
	}
	if p < 0 {
		return Properties{}, -1
	}
	return fd.info(inputBytes(buf), p), p
}

// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. It truncates rb.out to exclude that last segment.
func decomposeToLastBoundary(rb *reorderBuffer) {
	fd := &rb.f
	info, i := lastRuneStart(fd, rb.out)
	if int(info.size) != len(rb.out)-i {
		// illegal trailing continuation bytes
		return
	}
	if info.BoundaryAfter() {
		return
	}
	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
	padd := 0
	ss := streamSafe(0)
	p := len(rb.out)
	for {
		add[padd] = info
		v := ss.backwards(info)
		if v == ssOverflow {
			// Note that if we have an overflow, the string we are appending to
			// is not correctly normalized. In this case the behavior is undefined.
			break
		}
		padd++
		p -= int(info.size)
		if v == ssStarter || p < 0 {
			break
		}
		info, i = lastRuneStart(fd, rb.out[:p])
		if int(info.size) != p-i {
			break
		}
	}
	rb.ss = ss
	// Copy bytes for insertion as we may need to overwrite rb.out.
	var buf [maxBufferSize * utf8.UTFMax]byte
	cp := buf[:copy(buf[:], rb.out[p:])]
	rb.out = rb.out[:p]
	// Re-insert the collected runes in their original (forward) order.
	for padd--; padd >= 0; padd-- {
		info = add[padd]
		rb.insertUnsafe(inputBytes(cp), 0, info)
		cp = cp[info.size:]
	}
}