Source File
forminfo.go
Belonging Package
vendor/golang.org/x/text/unicode/norm
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package norm
import
// This file contains Form-specific logic and wrappers for data in tables.go.
// Rune info is stored in a separate trie per composing form. A composing form
// and its corresponding decomposing form share the same trie. Each trie maps
// a rune to a uint16. The values take two forms. For v >= 0x8000:
// bits
// 15: 1 (inverse of NFD_QC bit of qcInfo)
// 13..7: qcInfo (see below). isYesD is always true (no decomposition).
// 6..0: ccc (compressed CCC value).
// For v < 0x8000, the respective rune has a decomposition and v is an index
// into a byte array of UTF-8 decomposition sequences and additional info and
// has the form:
// <header> <decomp_byte>* [<tccc> [<lccc>]]
// The header contains the number of bytes in the decomposition (excluding this
// length byte). The two most significant bits of this length byte correspond
// to bit 5 and 4 of qcInfo (see below). The byte sequence itself starts at v+1.
// The byte sequence is followed by a trailing and leading CCC if the values
// for these are not zero. The value of v determines which ccc are appended
// to the sequences. For v < firstCCC, there are none, for v >= firstCCC,
// the sequence is followed by a trailing ccc, and for v >= firstLeadingCCC
// there is an additional leading ccc. The value of tccc itself is the
// trailing CCC shifted left 2 bits. The two least-significant bits of tccc
// are the number of trailing non-starters.
// Bit masks used to unpack a qcInfo value and the header byte that precedes
// each decomposition sequence. The header byte is split into a 6-bit length
// (headerLenMask) and 2 flag bits (headerFlagsMask).
const (
qcInfoMask = 0x3F // keeps bits 5..0, the relevant bits in a qcInfo
headerLenMask = 0x3F // extract the length value (low 6 bits) from the header byte
headerFlagsMask = 0xC0 // extract the qcInfo bits (top 2 bits) from the header byte
)
// Properties provides access to normalization properties of a rune.
// The ccc and tccc fields hold compressed values that are used as indices
// into the ccc table by the CCC, LeadCCC and TrailCCC accessors.
type Properties struct {
pos uint8 // start position in reorderBuffer; used in composition.go
size uint8 // length of UTF-8 encoding of this rune
ccc uint8 // leading canonical combining class (ccc if not decomposition)
tccc uint8 // trailing canonical combining class (ccc if not decomposition)
nLead uint8 // number of leading non-starters.
flags qcInfo // quick check flags
index uint16 // 0 means no decomposition; otherwise presumably the offset of the decomposition header in decomps
}
// functions dispatchable per form

// lookupFunc returns the Properties of a rune taken from input b; i is
// presumably the offset in b at which the rune starts.
type lookupFunc func(b input, i int) Properties
// formInfo holds Form-specific functions and tables.
type formInfo struct {
form Form // the normalization form this entry describes
composing, compatibility bool // form type
info lookupFunc // property lookup used for this form
nextMain iterFunc // iteration function for this form (iterFunc is declared outside this view)
}
// formTable holds one formInfo per normalization form, in the order
// NFC, NFD, NFKC, NFKD. A composing form and its decomposing counterpart use
// the same lookup function (lookupInfoNFC for NFC/NFD, lookupInfoNFKC for
// NFKC/NFKD) and differ only in the composing flag and the nextMain iterator.
// NOTE(review): the table is presumably indexed by Form value; confirm
// against the Form constants.
var formTable = []*formInfo{{
form: NFC,
composing: true,
compatibility: false,
info: lookupInfoNFC,
nextMain: nextComposed,
}, {
form: NFD,
composing: false,
compatibility: false,
info: lookupInfoNFC,
nextMain: nextDecomposed,
}, {
form: NFKC,
composing: true,
compatibility: true,
info: lookupInfoNFKC,
nextMain: nextComposed,
}, {
form: NFKD,
composing: false,
compatibility: true,
info: lookupInfoNFKC,
nextMain: nextDecomposed,
}}
// We do not distinguish between boundaries for NFC, NFD, etc. to avoid
// unexpected behavior for the user. For example, in NFD, there is a boundary
// after 'a'. However, 'a' might combine with modifiers, so from the application's
// perspective it is not a good boundary. We will therefore always use the
// boundaries for the combining variants.
// BoundaryBefore returns true if this rune starts a new segment and
// cannot combine with any rune on the left.
//
// Concretely, it reports true only when the ccc field is zero and the
// combines-backward flag is not set.
func ( Properties) () bool {
if .ccc == 0 && !.combinesBackward() {
return true
}
// We assume that the CCC of the first character in a decomposition
// is always non-zero if different from info.ccc and that we can return
// false at this point. This is verified by maketables.
return false
}
// BoundaryAfter returns true if runes cannot combine with or otherwise
// interact with this or previous runes.
//
// The current implementation is conservative: it reports true only for
// inert runes (see isInert).
func ( Properties) () bool {
// TODO: loosen these conditions.
return .isInert()
}
// We pack quick check data in 6 bits (the bits selected by qcInfoMask):
//
// 5: Combines forward (0 == false, 1 == true)
// 4..3: NFC_QC Yes(00), No (10), or Maybe (11)
// 2: NFD_QC Yes (0) or No (1). No also means there is a decomposition.
// 1..0: Number of trailing non-starters.
//
// When all 6 bits are zero and the ccc is also zero, the character is inert,
// meaning it is never influenced by normalization.
type qcInfo uint8
// Reports whether bit 4 of flags is clear, i.e. NFC_QC is Yes (00) rather
// than No (10) or Maybe (11).
func ( Properties) () bool { return .flags&0x10 == 0 }
// Reports whether bit 2 of flags is clear, i.e. NFD_QC is Yes and the rune
// has no decomposition.
func ( Properties) () bool { return .flags&0x4 == 0 }
// Reports whether bit 5 of flags is set, i.e. the rune combines forward.
func ( Properties) () bool { return .flags&0x20 != 0 }
// Reports whether bit 3 of flags is set. Of the NFC_QC encodings Yes (00),
// No (10) and Maybe (11), only Maybe has this bit set.
func ( Properties) () bool { return .flags&0x8 != 0 } // == isMaybe
// Reports whether bit 2 of flags is set, i.e. NFD_QC is No, which also means
// the rune has a decomposition.
func ( Properties) () bool { return .flags&0x4 != 0 } // == isNoD
// Reports whether the rune is inert: every quick check bit selected by
// qcInfoMask is zero and the ccc field is zero.
func ( Properties) () bool {
return .flags&qcInfoMask == 0 && .ccc == 0
}
// Reports whether index lies in the half-open range [firstMulti, endMulti).
// NOTE(review): firstMulti and endMulti are defined outside this view;
// presumably they delimit decompositions that span multiple segments. Confirm
// against tables.go.
func ( Properties) () bool {
return .index >= firstMulti && .index < endMulti
}
// Returns the nLead field: the number of leading non-starters.
func ( Properties) () uint8 {
return .nLead
}
// Returns the two low bits of flags: the number of trailing non-starters.
func ( Properties) () uint8 {
return uint8(.flags & 0x03)
}
// Decomposition returns the decomposition for the underlying rune
// or nil if there is none.
//
// The result is a sub-slice of the package-level decomps table, not a copy.
func ( Properties) () []byte {
// TODO: create the decomposition for Hangul?
// An index of zero means there is no decomposition.
if .index == 0 {
return nil
}
:= .index
// The low 6 bits of the header byte give the byte length of the sequence.
:= decomps[] & headerLenMask
// The decomposition bytes start immediately after the header byte.
++
return decomps[ : +uint16()]
}
// Size returns the length of UTF-8 encoding of the rune, as an int
// (the value is stored in the uint8 size field).
func ( Properties) () int {
return int(.size)
}
// CCC returns the canonical combining class of the underlying rune.
//
// It returns 0 when index is at or above firstCCCZeroExcept; otherwise the
// compressed ccc field is expanded through the ccc table.
// NOTE(review): firstCCCZeroExcept is defined outside this view; presumably
// it marks decompositions whose rune itself has CCC zero. Confirm against
// tables.go.
func ( Properties) () uint8 {
if .index >= firstCCCZeroExcept {
return 0
}
return ccc[.ccc]
}
// LeadCCC returns the CCC of the first rune in the decomposition.
// If there is no decomposition, LeadCCC equals CCC.
// The compressed ccc field is expanded through the ccc table.
func ( Properties) () uint8 {
return ccc[.ccc]
}
// TrailCCC returns the CCC of the last rune in the decomposition.
// If there is no decomposition, TrailCCC equals CCC.
// The compressed tccc field is expanded through the ccc table.
func ( Properties) () uint8 {
return ccc[.tccc]
}
// buildRecompMap populates the package-level recompMap from recompMapPacked.
// The packed data is consumed in 8-byte records; each record is decoded as
// two big-endian uint32 values and yields one map entry whose value is a rune.
// NOTE(review): presumably the first uint32 is the key and the second the
// composed rune; confirm against the table generator.
func buildRecompMap() {
// One map entry per 8-byte record.
recompMap = make(map[uint32]rune, len(recompMapPacked)/8)
var [8]byte
for := 0; < len(recompMapPacked); += 8 {
copy([:], recompMapPacked[:+8])
:= binary.BigEndian.Uint32([:4])
:= binary.BigEndian.Uint32([4:])
recompMap[] = rune()
}
}
// Recomposition
// We use 32-bit keys instead of 64-bit for the two codepoint keys.
// This clips off the bits of three entries, but we know this will not
// result in a collision. In the unlikely event that changes to
// UnicodeData.txt introduce collisions, the compiler will catch it.
// Note that the recomposition map for NFC and NFKC are identical.
// combine returns the combined rune or 0 if it doesn't exist.
//
// The caller is responsible for calling
// recompMapOnce.Do(buildRecompMap) sometime before this is called.
// If recompMap is still nil, combine panics with "caller error".
func combine(, rune) rune {
// Build a 32-bit key from the two runes, each truncated to 16 bits: one
// occupies the upper half of the key and the other the lower half.
:= uint32(uint16())<<16 + uint32(uint16())
if recompMap == nil {
panic("caller error") // see func comment
}
// A missing key yields the zero rune.
return recompMap[]
}
// lookupInfoNFC is the lookupFunc used for NFC and NFD (see formTable). It
// passes the two results of the input's charinfoNFC method to compInfo.
func lookupInfoNFC( input, int) Properties {
, := .charinfoNFC()
return compInfo(, )
}
// lookupInfoNFKC is the lookupFunc used for NFKC and NFKD (see formTable). It
// passes the two results of the input's charinfoNFKC method to compInfo.
func lookupInfoNFKC( input, int) Properties {
, := .charinfoNFKC()
return compInfo(, )
}
// Properties returns properties for the first rune in s.
// NFC and NFD are looked up in nfcData; every other form is looked up in
// nfkcData.
func ( Form) ( []byte) Properties {
if == NFC || == NFD {
return compInfo(nfcData.lookup())
}
return compInfo(nfkcData.lookup())
}
// PropertiesString returns properties for the first rune in s.
// It is the string counterpart of Properties: NFC and NFD are looked up in
// nfcData; every other form is looked up in nfkcData.
func ( Form) ( string) Properties {
if == NFC || == NFD {
return compInfo(nfcData.lookupString())
}
return compInfo(nfkcData.lookupString())
}
// compInfo converts the information contained in v and sz
// to a Properties. See the comment at the top of the file
// for more information on the format.
//
// Three cases are handled: a zero value (only the size is recorded), a value
// >= 0x8000 (no decomposition; ccc and flags are unpacked from the value
// itself), and any other value (an index into decomps, where the
// decomposition header is found).
func compInfo( uint16, int) Properties {
if == 0 {
return Properties{size: uint8()}
} else if >= 0x8000 {
// Without a decomposition the leading and trailing ccc are the same.
:= Properties{
size: uint8(),
ccc: uint8(),
tccc: uint8(),
flags: qcInfo( >> 8),
}
// nLead is only filled in (from the two low flag bits) for non-starters
// and for runes that combine backward.
if .ccc > 0 || .combinesBackward() {
.nLead = uint8(.flags & 0x3)
}
return
}
// has decomposition
:= decomps[]
// The two top header bits become qcInfo bits 5 and 4; 0x4 sets the NFD_QC
// "No" bit, which marks the presence of a decomposition.
:= (qcInfo(&headerFlagsMask) >> 2) | 0x4
:= Properties{size: uint8(), flags: , index: }
if >= firstCCC {
// Step over the header byte and the decomposition bytes to reach tccc.
+= uint16(&headerLenMask) + 1
:= decomps[]
// tccc byte: trailing CCC in the upper six bits, number of trailing
// non-starters in the two low bits.
.tccc = >> 2
.flags |= qcInfo( & 0x3)
if >= firstLeadingCCC {
.nLead = & 0x3
if >= firstStarterWithNLead {
// We were tricked. Remove the decomposition.
// Only the two low flag bits (trailing non-starter count) are kept.
.flags &= 0x03
.index = 0
return
}
// The leading ccc byte follows the tccc byte.
.ccc = decomps[+1]
}
}
return
}
The pages are generated with Golds v0.7.3. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds. |