// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package unicode provides data and functions to test some properties of // Unicode code points.
package unicode const ( MaxRune = '\U0010FFFF' // Maximum valid Unicode code point. ReplacementChar = '\uFFFD' // Represents invalid code points. MaxASCII = '\u007F' // maximum ASCII value. MaxLatin1 = '\u00FF' // maximum Latin-1 value. ) // RangeTable defines a set of Unicode code points by listing the ranges of // code points within the set. The ranges are listed in two slices // to save space: a slice of 16-bit ranges and a slice of 32-bit ranges. // The two slices must be in sorted order and non-overlapping. // Also, R32 should contain only values >= 0x10000 (1<<16). type RangeTable struct { R16 []Range16 R32 []Range32 LatinOffset int // number of entries in R16 with Hi <= MaxLatin1 } // Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi // inclusive and has the specified stride. type Range16 struct { Lo uint16 Hi uint16 Stride uint16 } // Range32 represents of a range of Unicode code points and is used when one or // more of the values will not fit in 16 bits. The range runs from Lo to Hi // inclusive and has the specified stride. Lo and Hi must always be >= 1<<16. type Range32 struct { Lo uint32 Hi uint32 Stride uint32 } // CaseRange represents a range of Unicode code points for simple (one // code point to one code point) case conversion. // The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas // are the number to add to the code point to reach the code point for a // different case for that character. They may be negative. If zero, it // means the character is in the corresponding case. There is a special // case representing sequences of alternating corresponding Upper and Lower // pairs. It appears with a fixed Delta of // // {UpperLower, UpperLower, UpperLower} // // The constant UpperLower has an otherwise impossible delta value. type CaseRange struct { Lo uint32 Hi uint32 Delta d } // SpecialCase represents language-specific case mappings such as Turkish. // Methods of SpecialCase customize (by overriding) the standard mappings. type SpecialCase []CaseRange // BUG(r): There is no mechanism for full case folding, that is, for // characters that involve multiple runes in the input or output. // Indices into the Delta arrays inside CaseRanges for case mapping. const ( UpperCase = iota LowerCase TitleCase MaxCase ) type d [MaxCase]rune // to make the CaseRanges text shorter // If the Delta field of a [CaseRange] is UpperLower, it means // this CaseRange represents a sequence of the form (say) // [Upper] [Lower] [Upper] [Lower]. const ( UpperLower = MaxRune + 1 // (Cannot be a valid delta.) ) // linearMax is the maximum size table for linear search for non-Latin1 rune. // Derived by running 'go test -calibrate'. const linearMax = 18 // is16 reports whether r is in the sorted slice of 16-bit ranges. func is16( []Range16, uint16) bool { if len() <= linearMax || <= MaxLatin1 { for := range { := &[] if < .Lo { return false } if <= .Hi { return .Stride == 1 || (-.Lo)%.Stride == 0 } } return false } // binary search over ranges := 0 := len() for < { := int(uint(+) >> 1) := &[] if .Lo <= && <= .Hi { return .Stride == 1 || (-.Lo)%.Stride == 0 } if < .Lo { = } else { = + 1 } } return false } // is32 reports whether r is in the sorted slice of 32-bit ranges. func is32( []Range32, uint32) bool { if len() <= linearMax { for := range { := &[] if < .Lo { return false } if <= .Hi { return .Stride == 1 || (-.Lo)%.Stride == 0 } } return false } // binary search over ranges := 0 := len() for < { := int(uint(+) >> 1) := [] if .Lo <= && <= .Hi { return .Stride == 1 || (-.Lo)%.Stride == 0 } if < .Lo { = } else { = + 1 } } return false } // Is reports whether the rune is in the specified table of ranges. func ( *RangeTable, rune) bool { := .R16 // Compare as uint32 to correctly handle negative runes. if len() > 0 && uint32() <= uint32([len()-1].Hi) { return is16(, uint16()) } := .R32 if len() > 0 && >= rune([0].Lo) { return is32(, uint32()) } return false } func isExcludingLatin( *RangeTable, rune) bool { := .R16 // Compare as uint32 to correctly handle negative runes. if := .LatinOffset; len() > && uint32() <= uint32([len()-1].Hi) { return is16([:], uint16()) } := .R32 if len() > 0 && >= rune([0].Lo) { return is32(, uint32()) } return false } // IsUpper reports whether the rune is an upper case letter. func ( rune) bool { // See comment in IsGraphic. if uint32() <= MaxLatin1 { return properties[uint8()]&pLmask == pLu } return isExcludingLatin(Upper, ) } // IsLower reports whether the rune is a lower case letter. func ( rune) bool { // See comment in IsGraphic. if uint32() <= MaxLatin1 { return properties[uint8()]&pLmask == pLl } return isExcludingLatin(Lower, ) } // IsTitle reports whether the rune is a title case letter. func ( rune) bool { if <= MaxLatin1 { return false } return isExcludingLatin(Title, ) } // lookupCaseRange returns the CaseRange mapping for rune r or nil if no // mapping exists for r. func lookupCaseRange( rune, []CaseRange) *CaseRange { // binary search over ranges := 0 := len() for < { := int(uint(+) >> 1) := &[] if rune(.Lo) <= && <= rune(.Hi) { return } if < rune(.Lo) { = } else { = + 1 } } return nil } // convertCase converts r to _case using CaseRange cr. func convertCase( int, rune, *CaseRange) rune { := .Delta[] if > MaxRune { // In an Upper-Lower sequence, which always starts with // an UpperCase letter, the real deltas always look like: // {0, 1, 0} UpperCase (Lower is next) // {-1, 0, -1} LowerCase (Upper, Title are previous) // The characters at even offsets from the beginning of the // sequence are upper case; the ones at odd offsets are lower. // The correct mapping can be done by clearing or setting the low // bit in the sequence offset. // The constants UpperCase and TitleCase are even while LowerCase // is odd so we take the low bit from _case. return rune(.Lo) + ((-rune(.Lo))&^1 | rune(&1)) } return + } // to maps the rune using the specified case mapping. // It additionally reports whether caseRange contained a mapping for r. func to( int, rune, []CaseRange) ( rune, bool) { if < 0 || MaxCase <= { return ReplacementChar, false // as reasonable an error as any } if := lookupCaseRange(, ); != nil { return convertCase(, , ), true } return , false } // To maps the rune to the specified case: [UpperCase], [LowerCase], or [TitleCase]. func ( int, rune) rune { , _ = to(, , CaseRanges) return } // ToUpper maps the rune to upper case. func ( rune) rune { if <= MaxASCII { if 'a' <= && <= 'z' { -= 'a' - 'A' } return } return To(UpperCase, ) } // ToLower maps the rune to lower case. func ( rune) rune { if <= MaxASCII { if 'A' <= && <= 'Z' { += 'a' - 'A' } return } return To(LowerCase, ) } // ToTitle maps the rune to title case. func ( rune) rune { if <= MaxASCII { if 'a' <= && <= 'z' { // title case is upper case for ASCII -= 'a' - 'A' } return } return To(TitleCase, ) } // ToUpper maps the rune to upper case giving priority to the special mapping. func ( SpecialCase) ( rune) rune { , := to(UpperCase, , []CaseRange()) if == && ! { = ToUpper() } return } // ToTitle maps the rune to title case giving priority to the special mapping. func ( SpecialCase) ( rune) rune { , := to(TitleCase, , []CaseRange()) if == && ! { = ToTitle() } return } // ToLower maps the rune to lower case giving priority to the special mapping. func ( SpecialCase) ( rune) rune { , := to(LowerCase, , []CaseRange()) if == && ! { = ToLower() } return } // caseOrbit is defined in tables.go as []foldPair. Right now all the // entries fit in uint16, so use uint16. If that changes, compilation // will fail (the constants in the composite literal will not fit in uint16) // and the types here can change to uint32. type foldPair struct { From uint16 To uint16 } // SimpleFold iterates over Unicode code points equivalent under // the Unicode-defined simple case folding. Among the code points // equivalent to rune (including rune itself), SimpleFold returns the // smallest rune > r if one exists, or else the smallest rune >= 0. // If r is not a valid Unicode code point, SimpleFold(r) returns r. // // For example: // // SimpleFold('A') = 'a' // SimpleFold('a') = 'A' // // SimpleFold('K') = 'k' // SimpleFold('k') = '\u212A' (Kelvin symbol, K) // SimpleFold('\u212A') = 'K' // // SimpleFold('1') = '1' // // SimpleFold(-2) = -2 func ( rune) rune { if < 0 || > MaxRune { return } if int() < len(asciiFold) { return rune(asciiFold[]) } // Consult caseOrbit table for special cases. := 0 := len(caseOrbit) for < { := int(uint(+) >> 1) if rune(caseOrbit[].From) < { = + 1 } else { = } } if < len(caseOrbit) && rune(caseOrbit[].From) == { return rune(caseOrbit[].To) } // No folding specified. This is a one- or two-element // equivalence class containing rune and ToLower(rune) // and ToUpper(rune) if they are different from rune. if := lookupCaseRange(, CaseRanges); != nil { if := convertCase(LowerCase, , ); != { return } return convertCase(UpperCase, , ) } return }