// Copyright 2011 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.package normimportconst ( maxNonStarters = 30// The maximum number of characters needed for a buffer is // maxNonStarters + 1 for the starter + 1 for the GCJ maxBufferSize = maxNonStarters + 2 maxNFCExpansion = 3// NFC(0x1D160) maxNFKCExpansion = 18// NFKC(0xFDFA) maxByteBufferSize = utf8.UTFMax * maxBufferSize// 128)// ssState is used for reporting the segment state after inserting a rune.// It is returned by streamSafe.next.type ssState intconst (// Indicates a rune was successfully added to the segment. ssSuccess ssState = iota// Indicates a rune starts a new segment and should not be added. ssStarter// Indicates a rune caused a segment overflow and a CGJ should be inserted. ssOverflow)// streamSafe implements the policy of when a CGJ should be inserted.type streamSafe uint8// first inserts the first rune of a segment. It is a faster version of next if// it is known p represents the first rune in a segment.func ( *streamSafe) ( Properties) { * = streamSafe(.nTrailingNonStarters())}// insert returns a ssState value to indicate whether a rune represented by p// can be inserted.func ( *streamSafe) ( Properties) ssState {if * > maxNonStarters {panic("streamSafe was not reset") } := .nLeadingNonStarters()if * += streamSafe(); * > maxNonStarters { * = 0returnssOverflow }// The Stream-Safe Text Processing prescribes that the counting can stop // as soon as a starter is encountered. However, there are some starters, // like Jamo V and T, that can combine with other runes, leaving their // successive non-starters appended to the previous, possibly causing an // overflow. We will therefore consider any rune with a non-zero nLead to // be a non-starter. Note that it always hold that if nLead > 0 then // nLead == nTrail.if == 0 { * = streamSafe(.nTrailingNonStarters())returnssStarter }returnssSuccess}// backwards is used for checking for overflow and segment starts// when traversing a string backwards. Users do not need to call first// for the first rune. The state of the streamSafe retains the count of// the non-starters loaded.func ( *streamSafe) ( Properties) ssState {if * > maxNonStarters {panic("streamSafe was not reset") } := * + streamSafe(.nTrailingNonStarters())if > maxNonStarters {returnssOverflow } * = if .nLeadingNonStarters() == 0 {returnssStarter }returnssSuccess}func ( streamSafe) () bool {return == maxNonStarters}// GraphemeJoiner is inserted after maxNonStarters non-starter runes.constGraphemeJoiner = "\u034F"// reorderBuffer is used to normalize a single segment. Characters inserted with// insert are decomposed and reordered based on CCC. The compose method can// be used to recombine characters. Note that the byte buffer does not hold// the UTF-8 characters in order. Only the rune array is maintained in sorted// order. flush writes the resulting segment to a byte array.type reorderBuffer struct { rune [maxBufferSize]Properties// Per character info. byte [maxByteBufferSize]byte// UTF-8 buffer. Referenced by runeInfo.pos. nbyte uint8// Number or bytes. ss streamSafe// For limiting length of non-starter sequence. nrune int// Number of runeInfos. f formInfo src input nsrc int tmpBytes input out []byte flushF func(*reorderBuffer) bool}func ( *reorderBuffer) ( Form, []byte) { .f = *formTable[] .src.setBytes() .nsrc = len() .ss = 0}func ( *reorderBuffer) ( Form, string) { .f = *formTable[] .src.setString() .nsrc = len() .ss = 0}func ( *reorderBuffer) ( []byte, func(*reorderBuffer) bool) { .out = .flushF = }// reset discards all characters from the buffer.func ( *reorderBuffer) () { .nrune = 0 .nbyte = 0}func ( *reorderBuffer) () bool {if .f.composing { .compose() } := .flushF() .reset()return}// appendFlush appends the normalized segment to rb.out.func appendFlush( *reorderBuffer) bool {for := 0; < .nrune; ++ { := .rune[].pos := + .rune[].size .out = append(.out, .byte[:]...) }returntrue}// flush appends the normalized segment to out and resets rb.func ( *reorderBuffer) ( []byte) []byte {for := 0; < .nrune; ++ { := .rune[].pos := + .rune[].size = append(, .byte[:]...) } .reset()return}// flushCopy copies the normalized segment to buf and resets rb.// It returns the number of bytes written to buf.func ( *reorderBuffer) ( []byte) int { := 0for := 0; < .nrune; ++ { := .rune[] += copy([:], .byte[.pos:.pos+.size]) } .reset()return}// insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class.// It returns false if the buffer is not large enough to hold the rune.// It is used internally by insert and insertString only.func ( *reorderBuffer) ( Properties) { := .nrune := .rune[:] := .cccif > 0 {// Find insertion position + move elements to make room.for ; > 0; -- {if [-1].ccc <= {break } [] = [-1] } } .nrune += 1 := uint8(.nbyte) .nbyte += utf8.UTFMax .pos = [] = }// insertErr is an error code returned by insert. Using this type instead// of error improves performance up to 20% for many of the benchmarks.type insertErr intconst ( iSuccess insertErr = -iota iShortDst iShortSrc)// insertFlush inserts the given rune in the buffer ordered by CCC.// If a decomposition with multiple segments are encountered, they leading// ones are flushed.// It returns a non-zero error code if the rune was not inserted.func ( *reorderBuffer) ( input, int, Properties) insertErr {if := .hangul(); != 0 { .decomposeHangul()returniSuccess }if .hasDecomposition() {return .insertDecomposed(.Decomposition()) } .insertSingle(, , )returniSuccess}// insertUnsafe inserts the given rune in the buffer ordered by CCC.// It is assumed there is sufficient space to hold the runes. It is the// responsibility of the caller to ensure this. This can be done by checking// the state returned by the streamSafe type.func ( *reorderBuffer) ( input, int, Properties) {if := .hangul(); != 0 { .decomposeHangul() }if .hasDecomposition() {// TODO: inline. .insertDecomposed(.Decomposition()) } else { .insertSingle(, , ) }}// insertDecomposed inserts an entry in to the reorderBuffer for each rune// in dcomp. dcomp must be a sequence of decomposed UTF-8-encoded runes.// It flushes the buffer on each new segment start.func ( *reorderBuffer) ( []byte) insertErr { .tmpBytes.setBytes()// As the streamSafe accounting already handles the counting for modifiers, // we don't have to call next. However, we do need to keep the accounting // intact when flushing the buffer.for := 0; < len(); { := .f.info(.tmpBytes, )if .BoundaryBefore() && .nrune > 0 && !.doFlush() {returniShortDst } += copy(.byte[.nbyte:], [:+int(.size)]) .insertOrdered() }returniSuccess}// insertSingle inserts an entry in the reorderBuffer for the rune at// position i. info is the runeInfo for the rune at position i.func ( *reorderBuffer) ( input, int, Properties) { .copySlice(.byte[.nbyte:], , +int(.size)) .insertOrdered()}// insertCGJ inserts a Combining Grapheme Joiner (0x034f) into rb.func ( *reorderBuffer) () { .insertSingle(input{str: GraphemeJoiner}, 0, Properties{size: uint8(len(GraphemeJoiner))})}// appendRune inserts a rune at the end of the buffer. It is used for Hangul.func ( *reorderBuffer) ( rune) { := .nbyte := utf8.EncodeRune(.byte[:], rune()) .nbyte += utf8.UTFMax .rune[.nrune] = Properties{pos: , size: uint8()} .nrune++}// assignRune sets a rune at position pos. It is used for Hangul and recomposition.func ( *reorderBuffer) ( int, rune) { := .rune[].pos := utf8.EncodeRune(.byte[:], rune()) .rune[] = Properties{pos: , size: uint8()}}// runeAt returns the rune at position n. It is used for Hangul and recomposition.func ( *reorderBuffer) ( int) rune { := .rune[] , := utf8.DecodeRune(.byte[.pos : .pos+.size])return}// bytesAt returns the UTF-8 encoding of the rune at position n.// It is used for Hangul and recomposition.func ( *reorderBuffer) ( int) []byte { := .rune[]return .byte[.pos : int(.pos)+int(.size)]}// For Hangul we combine algorithmically, instead of using tables.const ( hangulBase = 0xAC00// UTF-8(hangulBase) -> EA B0 80 hangulBase0 = 0xEA hangulBase1 = 0xB0 hangulBase2 = 0x80 hangulEnd = hangulBase + jamoLVTCount// UTF-8(0xD7A4) -> ED 9E A4 hangulEnd0 = 0xED hangulEnd1 = 0x9E hangulEnd2 = 0xA4 jamoLBase = 0x1100// UTF-8(jamoLBase) -> E1 84 00 jamoLBase0 = 0xE1 jamoLBase1 = 0x84 jamoLEnd = 0x1113 jamoVBase = 0x1161 jamoVEnd = 0x1176 jamoTBase = 0x11A7 jamoTEnd = 0x11C3 jamoTCount = 28 jamoVCount = 21 jamoVTCount = 21 * 28 jamoLVTCount = 19 * 21 * 28)const hangulUTF8Size = 3func isHangul( []byte) bool {iflen() < hangulUTF8Size {returnfalse } := [0]if < hangulBase0 {returnfalse } := [1]switch {case == hangulBase0:return >= hangulBase1case < hangulEnd0:returntruecase > hangulEnd0:returnfalsecase < hangulEnd1:returntrue }return == hangulEnd1 && [2] < hangulEnd2}func isHangulString( string) bool {iflen() < hangulUTF8Size {returnfalse } := [0]if < hangulBase0 {returnfalse } := [1]switch {case == hangulBase0:return >= hangulBase1case < hangulEnd0:returntruecase > hangulEnd0:returnfalsecase < hangulEnd1:returntrue }return == hangulEnd1 && [2] < hangulEnd2}// Caller must ensure len(b) >= 2.func isJamoVT( []byte) bool {// True if (rune & 0xff00) == jamoLBasereturn [0] == jamoLBase0 && ([1]&0xFC) == jamoLBase1}func isHangulWithoutJamoT( []byte) bool { , := utf8.DecodeRune() -= hangulBasereturn < jamoLVTCount && %jamoTCount == 0}// decomposeHangul writes the decomposed Hangul to buf and returns the number// of bytes written. len(buf) should be at least 9.func decomposeHangul( []byte, rune) int {const = 3 -= hangulBase := % jamoTCount /= jamoTCountutf8.EncodeRune(, jamoLBase+/jamoVCount)utf8.EncodeRune([:], jamoVBase+%jamoVCount)if != 0 {utf8.EncodeRune([2*:], jamoTBase+)return3 * }return2 * }// decomposeHangul algorithmically decomposes a Hangul rune into// its Jamo components.// See https://unicode.org/reports/tr15/#Hangul for details on decomposing Hangul.func ( *reorderBuffer) ( rune) { -= hangulBase := % jamoTCount /= jamoTCount .appendRune(jamoLBase + /jamoVCount) .appendRune(jamoVBase + %jamoVCount)if != 0 { .appendRune(jamoTBase + ) }}// combineHangul algorithmically combines Jamo character components into Hangul.// See https://unicode.org/reports/tr15/#Hangul for details on combining Hangul.func ( *reorderBuffer) (, , int) { := .rune[:] := .nrunefor ; < ; ++ { := [-1].ccc := [].cccif == 0 { = - 1 }if != -1 && >= {// b[i] is blocked by greater-equal cccX below it [] = [] ++ } else { := .runeAt() // also used to compare to hangulBase := .runeAt() // also used to compare to jamoTswitch {casejamoLBase <= && < jamoLEnd &&jamoVBase <= && < jamoVEnd:// 11xx plus 116x to LV .assignRune(, hangulBase+ (-jamoLBase)*jamoVTCount+(-jamoVBase)*jamoTCount)casehangulBase <= && < hangulEnd &&jamoTBase < && < jamoTEnd && ((-hangulBase)%jamoTCount) == 0:// ACxx plus 11Ax to LVT .assignRune(, +-jamoTBase)default: [] = [] ++ } } } .nrune = }// compose recombines the runes in the buffer.// It should only be used to recompose a single segment, as it will not// handle alternations between Hangul and non-Hangul characters correctly.func ( *reorderBuffer) () {// Lazily load the map used by the combine func below, but do // it outside of the loop.recompMapOnce.Do(buildRecompMap)// UAX #15, section X5 , including Corrigendum #5 // "In any character sequence beginning with starter S, a character C is // blocked from S if and only if there is some character B between S // and C, and either B is a starter or it has the same or higher // combining class as C." := .nruneif == 0 {return } := 1 := .rune[:]for , := 0, 1; < ; ++ {ifisJamoVT(.bytesAt()) {// Redo from start in Hangul mode. Necessary to support // U+320E..U+321E in NFKC mode. .combineHangul(, , )return } := []// We can only use combineForward as a filter if we later // get the info for the combined character. This is more // expensive than using the filter. Using combinesBackward() // is safe.if .combinesBackward() { := [-1].ccc := .ccc := false// b[i] blocked by starter or greater or equal CCC?if == 0 { = - 1 } else { = != -1 && >= }if ! { := combine(.runeAt(), .runeAt())if != 0 { .assignRune(, )continue } } } [] = [] ++ } .nrune = }
The pages are generated with Goldsv0.7.3. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.