// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package strings implements simple functions to manipulate UTF-8 encoded strings. // // For information about UTF-8 strings in Go, see https://blog.golang.org/strings.
package strings import ( ) // explode splits s into a slice of UTF-8 strings, // one string per Unicode character up to a maximum of n (n < 0 means no limit). // Invalid UTF-8 sequences become correct encodings of U+FFFD. func explode( string, int) []string { := utf8.RuneCountInString() if < 0 || > { = } := make([]string, ) for := 0; < -1; ++ { , := utf8.DecodeRuneInString() [] = [:] = [:] if == utf8.RuneError { [] = string(utf8.RuneError) } } if > 0 { [-1] = } return } // Count counts the number of non-overlapping instances of substr in s. // If substr is an empty string, Count returns 1 + the number of Unicode code points in s. func (, string) int { // special case if len() == 0 { return utf8.RuneCountInString() + 1 } if len() == 1 { return bytealg.CountString(, [0]) } := 0 for { := Index(, ) if == -1 { return } ++ = [+len():] } } // Contains reports whether substr is within s. func (, string) bool { return Index(, ) >= 0 } // ContainsAny reports whether any Unicode code points in chars are within s. func (, string) bool { return IndexAny(, ) >= 0 } // ContainsRune reports whether the Unicode code point r is within s. func ( string, rune) bool { return IndexRune(, ) >= 0 } // LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s. func (, string) int { := len() switch { case == 0: return len() case == 1: return LastIndexByte(, [0]) case == len(): if == { return 0 } return -1 case > len(): return -1 } // Rabin-Karp search from the end of the string , := bytealg.HashStrRev() := len() - var uint32 for := len() - 1; >= ; -- { = *bytealg.PrimeRK + uint32([]) } if == && [:] == { return } for := - 1; >= 0; -- { *= bytealg.PrimeRK += uint32([]) -= * uint32([+]) if == && [:+] == { return } } return -1 } // IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s. func ( string, byte) int { return bytealg.IndexByteString(, ) } // IndexRune returns the index of the first instance of the Unicode code point // r, or -1 if rune is not present in s. // If r is utf8.RuneError, it returns the first instance of any // invalid UTF-8 byte sequence. func ( string, rune) int { switch { case 0 <= && < utf8.RuneSelf: return IndexByte(, byte()) case == utf8.RuneError: for , := range { if == utf8.RuneError { return } } return -1 case !utf8.ValidRune(): return -1 default: return Index(, string()) } } // IndexAny returns the index of the first instance of any Unicode code point // from chars in s, or -1 if no Unicode code point from chars is present in s. func (, string) int { if == "" { // Avoid scanning all of s. return -1 } if len() == 1 { // Avoid scanning all of s. := rune([0]) if >= utf8.RuneSelf { = utf8.RuneError } return IndexRune(, ) } if len() > 8 { if , := makeASCIISet(); { for := 0; < len(); ++ { if .contains([]) { return } } return -1 } } for , := range { if IndexRune(, ) >= 0 { return } } return -1 } // LastIndexAny returns the index of the last instance of any Unicode code // point from chars in s, or -1 if no Unicode code point from chars is // present in s. func (, string) int { if == "" { // Avoid scanning all of s. return -1 } if len() == 1 { := rune([0]) if >= utf8.RuneSelf { = utf8.RuneError } if IndexRune(, ) >= 0 { return 0 } return -1 } if len() > 8 { if , := makeASCIISet(); { for := len() - 1; >= 0; -- { if .contains([]) { return } } return -1 } } if len() == 1 { := rune([0]) if >= utf8.RuneSelf { = utf8.RuneError } for := len(); > 0; { , := utf8.DecodeLastRuneInString([:]) -= if == { return } } return -1 } for := len(); > 0; { , := utf8.DecodeLastRuneInString([:]) -= if IndexRune(, ) >= 0 { return } } return -1 } // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s. func ( string, byte) int { for := len() - 1; >= 0; -- { if [] == { return } } return -1 } // Generic split: splits after each instance of sep, // including sepSave bytes of sep in the subarrays. func genSplit(, string, , int) []string { if == 0 { return nil } if == "" { return explode(, ) } if < 0 { = Count(, ) + 1 } := make([]string, ) -- := 0 for < { := Index(, ) if < 0 { break } [] = [:+] = [+len():] ++ } [] = return [:+1] } // SplitN slices s into substrings separated by sep and returns a slice of // the substrings between those separators. // // The count determines the number of substrings to return: // n > 0: at most n substrings; the last substring will be the unsplit remainder. // n == 0: the result is nil (zero substrings) // n < 0: all substrings // // Edge cases for s and sep (for example, empty strings) are handled // as described in the documentation for Split. func (, string, int) []string { return genSplit(, , 0, ) } // SplitAfterN slices s into substrings after each instance of sep and // returns a slice of those substrings. // // The count determines the number of substrings to return: // n > 0: at most n substrings; the last substring will be the unsplit remainder. // n == 0: the result is nil (zero substrings) // n < 0: all substrings // // Edge cases for s and sep (for example, empty strings) are handled // as described in the documentation for SplitAfter. func (, string, int) []string { return genSplit(, , len(), ) } // Split slices s into all substrings separated by sep and returns a slice of // the substrings between those separators. // // If s does not contain sep and sep is not empty, Split returns a // slice of length 1 whose only element is s. // // If sep is empty, Split splits after each UTF-8 sequence. If both s // and sep are empty, Split returns an empty slice. // // It is equivalent to SplitN with a count of -1. func (, string) []string { return genSplit(, , 0, -1) } // SplitAfter slices s into all substrings after each instance of sep and // returns a slice of those substrings. // // If s does not contain sep and sep is not empty, SplitAfter returns // a slice of length 1 whose only element is s. // // If sep is empty, SplitAfter splits after each UTF-8 sequence. If // both s and sep are empty, SplitAfter returns an empty slice. // // It is equivalent to SplitAfterN with a count of -1. func (, string) []string { return genSplit(, , len(), -1) } var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} // Fields splits the string s around each instance of one or more consecutive white space // characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an // empty slice if s contains only white space. func ( string) []string { // First count the fields. // This is an exact count if s is ASCII, otherwise it is an approximation. := 0 := 1 // setBits is used to track which bits are set in the bytes of s. := uint8(0) for := 0; < len(); ++ { := [] |= := int(asciiSpace[]) += & ^ = } if >= utf8.RuneSelf { // Some runes in the input string are not ASCII. return FieldsFunc(, unicode.IsSpace) } // ASCII fast path := make([]string, ) := 0 := 0 := 0 // Skip spaces in the front of the input. for < len() && asciiSpace[[]] != 0 { ++ } = for < len() { if asciiSpace[[]] == 0 { ++ continue } [] = [:] ++ ++ // Skip spaces in between fields. for < len() && asciiSpace[[]] != 0 { ++ } = } if < len() { // Last field might end at EOF. [] = [:] } return } // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) // and returns an array of slices of s. If all code points in s satisfy f(c) or the // string is empty, an empty slice is returned. // // FieldsFunc makes no guarantees about the order in which it calls f(c) // and assumes that f always returns the same value for a given c. func ( string, func(rune) bool) []string { // A span is used to record a slice of s of the form s[start:end]. // The start index is inclusive and the end index is exclusive. type struct { int int } := make([], 0, 32) // Find the field start and end indices. // Doing this in a separate pass (rather than slicing the string s // and collecting the result substrings right away) is significantly // more efficient, possibly due to cache effects. := -1 // valid span start if >= 0 for , := range { if () { if >= 0 { = append(, {, }) // Set start to a negative value. // Note: using -1 here consistently and reproducibly // slows down this code by a several percent on amd64. = ^ } } else { if < 0 { = } } } // Last field might end at EOF. if >= 0 { = append(, {, len()}) } // Create strings from recorded field indices. := make([]string, len()) for , := range { [] = [.:.] } return } // Join concatenates the elements of its first argument to create a single string. The separator // string sep is placed between elements in the resulting string. func ( []string, string) string { switch len() { case 0: return "" case 1: return [0] } := len() * (len() - 1) for := 0; < len(); ++ { += len([]) } var Builder .Grow() .WriteString([0]) for , := range [1:] { .WriteString() .WriteString() } return .String() } // HasPrefix tests whether the string s begins with prefix. func (, string) bool { return len() >= len() && [0:len()] == } // HasSuffix tests whether the string s ends with suffix. func (, string) bool { return len() >= len() && [len()-len():] == } // Map returns a copy of the string s with all its characters modified // according to the mapping function. If mapping returns a negative value, the character is // dropped from the string with no replacement. func ( func(rune) rune, string) string { // In the worst case, the string can grow when mapped, making // things unpleasant. But it's so rare we barge in assuming it's // fine. It could also shrink but that falls out naturally. // The output buffer b is initialized on demand, the first // time a character differs. var Builder for , := range { := () if == && != utf8.RuneError { continue } var int if == utf8.RuneError { , = utf8.DecodeRuneInString([:]) if != 1 && == { continue } } else { = utf8.RuneLen() } .Grow(len() + utf8.UTFMax) .WriteString([:]) if >= 0 { .WriteRune() } = [+:] break } // Fast path for unchanged input if .Cap() == 0 { // didn't call b.Grow above return } for , := range { := () if >= 0 { // common case // Due to inlining, it is more performant to determine if WriteByte should be // invoked rather than always call WriteRune if < utf8.RuneSelf { .WriteByte(byte()) } else { // r is not a ASCII rune. .WriteRune() } } } return .String() } // Repeat returns a new string consisting of count copies of the string s. // // It panics if count is negative or if // the result of (len(s) * count) overflows. func ( string, int) string { if == 0 { return "" } // Since we cannot return an error on overflow, // we should panic if the repeat will generate // an overflow. // See Issue golang.org/issue/16237 if < 0 { panic("strings: negative Repeat count") } else if len()*/ != len() { panic("strings: Repeat count causes overflow") } := len() * var Builder .Grow() .WriteString() for .Len() < { if .Len() <= /2 { .WriteString(.String()) } else { .WriteString(.String()[:-.Len()]) break } } return .String() } // ToUpper returns s with all Unicode letters mapped to their upper case. func ( string) string { , := true, false for := 0; < len(); ++ { := [] if >= utf8.RuneSelf { = false break } = || ('a' <= && <= 'z') } if { // optimize for ASCII-only strings. if ! { return } var Builder .Grow(len()) for := 0; < len(); ++ { := [] if 'a' <= && <= 'z' { -= 'a' - 'A' } .WriteByte() } return .String() } return Map(unicode.ToUpper, ) } // ToLower returns s with all Unicode letters mapped to their lower case. func ( string) string { , := true, false for := 0; < len(); ++ { := [] if >= utf8.RuneSelf { = false break } = || ('A' <= && <= 'Z') } if { // optimize for ASCII-only strings. if ! { return } var Builder .Grow(len()) for := 0; < len(); ++ { := [] if 'A' <= && <= 'Z' { += 'a' - 'A' } .WriteByte() } return .String() } return Map(unicode.ToLower, ) } // ToTitle returns a copy of the string s with all Unicode letters mapped to // their Unicode title case. func ( string) string { return Map(unicode.ToTitle, ) } // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their // upper case using the case mapping specified by c. func ( unicode.SpecialCase, string) string { return Map(.ToUpper, ) } // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their // lower case using the case mapping specified by c. func ( unicode.SpecialCase, string) string { return Map(.ToLower, ) } // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their // Unicode title case, giving priority to the special casing rules. func ( unicode.SpecialCase, string) string { return Map(.ToTitle, ) } // ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences // replaced by the replacement string, which may be empty. func (, string) string { var Builder for , := range { if != utf8.RuneError { continue } , := utf8.DecodeRuneInString([:]) if == 1 { .Grow(len() + len()) .WriteString([:]) = [:] break } } // Fast path for unchanged input if .Cap() == 0 { // didn't call b.Grow above return } := false // previous byte was from an invalid UTF-8 sequence for := 0; < len(); { := [] if < utf8.RuneSelf { ++ = false .WriteByte() continue } , := utf8.DecodeRuneInString([:]) if == 1 { ++ if ! { = true .WriteString() } continue } = false .WriteString([ : +]) += } return .String() } // isSeparator reports whether the rune could mark a word boundary. // TODO: update when package unicode captures more of the properties. func isSeparator( rune) bool { // ASCII alphanumerics and underscore are not separators if <= 0x7F { switch { case '0' <= && <= '9': return false case 'a' <= && <= 'z': return false case 'A' <= && <= 'Z': return false case == '_': return false } return true } // Letters and digits are not separators if unicode.IsLetter() || unicode.IsDigit() { return false } // Otherwise, all we can do for now is treat spaces as separators. return unicode.IsSpace() } // Title returns a copy of the string s with all Unicode letters that begin words // mapped to their Unicode title case. // // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly. func ( string) string { // Use a closure here to remember state. // Hackish but effective. Depends on Map scanning in order and calling // the closure once per rune. := ' ' return Map( func( rune) rune { if isSeparator() { = return unicode.ToTitle() } = return }, ) } // TrimLeftFunc returns a slice of the string s with all leading // Unicode code points c satisfying f(c) removed. func ( string, func(rune) bool) string { := indexFunc(, , false) if == -1 { return "" } return [:] } // TrimRightFunc returns a slice of the string s with all trailing // Unicode code points c satisfying f(c) removed. func ( string, func(rune) bool) string { := lastIndexFunc(, , false) if >= 0 && [] >= utf8.RuneSelf { , := utf8.DecodeRuneInString([:]) += } else { ++ } return [0:] } // TrimFunc returns a slice of the string s with all leading // and trailing Unicode code points c satisfying f(c) removed. func ( string, func(rune) bool) string { return TrimRightFunc(TrimLeftFunc(, ), ) } // IndexFunc returns the index into s of the first Unicode // code point satisfying f(c), or -1 if none do. func ( string, func(rune) bool) int { return indexFunc(, , true) } // LastIndexFunc returns the index into s of the last // Unicode code point satisfying f(c), or -1 if none do. func ( string, func(rune) bool) int { return lastIndexFunc(, , true) } // indexFunc is the same as IndexFunc except that if // truth==false, the sense of the predicate function is // inverted. func indexFunc( string, func(rune) bool, bool) int { for , := range { if () == { return } } return -1 } // lastIndexFunc is the same as LastIndexFunc except that if // truth==false, the sense of the predicate function is // inverted. func lastIndexFunc( string, func(rune) bool, bool) int { for := len(); > 0; { , := utf8.DecodeLastRuneInString([0:]) -= if () == { return } } return -1 } // asciiSet is a 32-byte value, where each bit represents the presence of a // given ASCII character in the set. The 128-bits of the lower 16 bytes, // starting with the least-significant bit of the lowest word to the // most-significant bit of the highest word, map to the full range of all // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed, // ensuring that any non-ASCII character will be reported as not in the set. type asciiSet [8]uint32 // makeASCIISet creates a set of ASCII characters and reports whether all // characters in chars are ASCII. func makeASCIISet( string) ( asciiSet, bool) { for := 0; < len(); ++ { := [] if >= utf8.RuneSelf { return , false } [>>5] |= 1 << uint(&31) } return , true } // contains reports whether c is inside the set. func ( *asciiSet) ( byte) bool { return ([>>5] & (1 << uint(&31))) != 0 } func makeCutsetFunc( string) func(rune) bool { if len() == 1 && [0] < utf8.RuneSelf { return func( rune) bool { return == rune([0]) } } if , := makeASCIISet(); { return func( rune) bool { return < utf8.RuneSelf && .contains(byte()) } } return func( rune) bool { return IndexRune(, ) >= 0 } } // Trim returns a slice of the string s with all leading and // trailing Unicode code points contained in cutset removed. func (, string) string { if == "" || == "" { return } return TrimFunc(, makeCutsetFunc()) } // TrimLeft returns a slice of the string s with all leading // Unicode code points contained in cutset removed. // // To remove a prefix, use TrimPrefix instead. func (, string) string { if == "" || == "" { return } return TrimLeftFunc(, makeCutsetFunc()) } // TrimRight returns a slice of the string s, with all trailing // Unicode code points contained in cutset removed. // // To remove a suffix, use TrimSuffix instead. func (, string) string { if == "" || == "" { return } return TrimRightFunc(, makeCutsetFunc()) } // TrimSpace returns a slice of the string s, with all leading // and trailing white space removed, as defined by Unicode. func ( string) string { // Fast path for ASCII: look for the first ASCII non-space byte := 0 for ; < len(); ++ { := [] if >= utf8.RuneSelf { // If we run into a non-ASCII byte, fall back to the // slower unicode-aware method on the remaining bytes return TrimFunc([:], unicode.IsSpace) } if asciiSpace[] == 0 { break } } // Now look for the first ASCII non-space byte from the end := len() for ; > ; -- { := [-1] if >= utf8.RuneSelf { return TrimFunc([:], unicode.IsSpace) } if asciiSpace[] == 0 { break } } // At this point s[start:stop] starts and ends with an ASCII // non-space bytes, so we're done. Non-ASCII cases have already // been handled above. return [:] } // TrimPrefix returns s without the provided leading prefix string. // If s doesn't start with prefix, s is returned unchanged. func (, string) string { if HasPrefix(, ) { return [len():] } return } // TrimSuffix returns s without the provided trailing suffix string. // If s doesn't end with suffix, s is returned unchanged. func (, string) string { if HasSuffix(, ) { return [:len()-len()] } return } // Replace returns a copy of the string s with the first n // non-overlapping instances of old replaced by new. // If old is empty, it matches at the beginning of the string // and after each UTF-8 sequence, yielding up to k+1 replacements // for a k-rune string. // If n < 0, there is no limit on the number of replacements. func (, , string, int) string { if == || == 0 { return // avoid allocation } // Compute number of replacements. if := Count(, ); == 0 { return // avoid allocation } else if < 0 || < { = } // Apply replacements to buffer. var Builder .Grow(len() + *(len()-len())) := 0 for := 0; < ; ++ { := if len() == 0 { if > 0 { , := utf8.DecodeRuneInString([:]) += } } else { += Index([:], ) } .WriteString([:]) .WriteString() = + len() } .WriteString([:]) return .String() } // ReplaceAll returns a copy of the string s with all // non-overlapping instances of old replaced by new. // If old is empty, it matches at the beginning of the string // and after each UTF-8 sequence, yielding up to k+1 replacements // for a k-rune string. func (, , string) string { return Replace(, , , -1) } // EqualFold reports whether s and t, interpreted as UTF-8 strings, // are equal under Unicode case-folding, which is a more general // form of case-insensitivity. func (, string) bool { for != "" && != "" { // Extract first rune from each string. var , rune if [0] < utf8.RuneSelf { , = rune([0]), [1:] } else { , := utf8.DecodeRuneInString() , = , [:] } if [0] < utf8.RuneSelf { , = rune([0]), [1:] } else { , := utf8.DecodeRuneInString() , = , [:] } // If they match, keep going; if not, return false. // Easy case. if == { continue } // Make sr < tr to simplify what follows. if < { , = , } // Fast check for ASCII. if < utf8.RuneSelf { // ASCII only, sr/tr must be upper/lower case if 'A' <= && <= 'Z' && == +'a'-'A' { continue } return false } // General case. SimpleFold(x) returns the next equivalent rune > x // or wraps around to smaller values. := unicode.SimpleFold() for != && < { = unicode.SimpleFold() } if == { continue } return false } // One string is empty. Are both? return == } // Index returns the index of the first instance of substr in s, or -1 if substr is not present in s. func (, string) int { := len() switch { case == 0: return 0 case == 1: return IndexByte(, [0]) case == len(): if == { return 0 } return -1 case > len(): return -1 case <= bytealg.MaxLen: // Use brute force when s and substr both are small if len() <= bytealg.MaxBruteForce { return bytealg.IndexString(, ) } := [0] := [1] := 0 := len() - + 1 := 0 for < { if [] != { // IndexByte is faster than bytealg.IndexString, so use it as long as // we're not getting lots of false positives. := IndexByte([+1:], ) if < 0 { return -1 } += + 1 } if [+1] == && [:+] == { return } ++ ++ // Switch to bytealg.IndexString when IndexByte produces too many false positives. if > bytealg.Cutover() { := bytealg.IndexString([:], ) if >= 0 { return + } return -1 } } return -1 } := [0] := [1] := 0 := len() - + 1 := 0 for < { if [] != { := IndexByte([+1:], ) if < 0 { return -1 } += + 1 } if [+1] == && [:+] == { return } ++ ++ if >= 4+>>4 && < { // See comment in ../bytes/bytes.go. := bytealg.IndexRabinKarp([:], ) if < 0 { return -1 } return + } } return -1 }