// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package bytes implements functions for the manipulation of byte slices. // It is analogous to the facilities of the strings package.
package bytes import ( ) // Equal reports whether a and b // are the same length and contain the same bytes. // A nil argument is equivalent to an empty slice. func (, []byte) bool { // Neither cmd/compile nor gccgo allocates for these string conversions. return string() == string() } // Compare returns an integer comparing two byte slices lexicographically. // The result will be 0 if a==b, -1 if a < b, and +1 if a > b. // A nil argument is equivalent to an empty slice. func (, []byte) int { return bytealg.Compare(, ) } // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes), // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes. func explode( []byte, int) [][]byte { if <= 0 { = len() } := make([][]byte, ) var int := 0 for len() > 0 { if +1 >= { [] = ++ break } _, = utf8.DecodeRune() [] = [0::] = [:] ++ } return [0:] } // Count counts the number of non-overlapping instances of sep in s. // If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s. func (, []byte) int { // special case if len() == 0 { return utf8.RuneCount() + 1 } if len() == 1 { return bytealg.Count(, [0]) } := 0 for { := Index(, ) if == -1 { return } ++ = [+len():] } } // Contains reports whether subslice is within b. func (, []byte) bool { return Index(, ) != -1 } // ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b. func ( []byte, string) bool { return IndexAny(, ) >= 0 } // ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b. func ( []byte, rune) bool { return IndexRune(, ) >= 0 } // IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b. func ( []byte, byte) int { return bytealg.IndexByte(, ) } func indexBytePortable( []byte, byte) int { for , := range { if == { return } } return -1 } // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s. func (, []byte) int { := len() switch { case == 0: return len() case == 1: return LastIndexByte(, [0]) case == len(): if Equal(, ) { return 0 } return -1 case > len(): return -1 } // Rabin-Karp search from the end of the string , := bytealg.HashStrRevBytes() := len() - var uint32 for := len() - 1; >= ; -- { = *bytealg.PrimeRK + uint32([]) } if == && Equal([:], ) { return } for := - 1; >= 0; -- { *= bytealg.PrimeRK += uint32([]) -= * uint32([+]) if == && Equal([:+], ) { return } } return -1 } // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s. func ( []byte, byte) int { for := len() - 1; >= 0; -- { if [] == { return } } return -1 } // IndexRune interprets s as a sequence of UTF-8-encoded code points. // It returns the byte index of the first occurrence in s of the given rune. // It returns -1 if rune is not present in s. // If r is utf8.RuneError, it returns the first instance of any // invalid UTF-8 byte sequence. func ( []byte, rune) int { switch { case 0 <= && < utf8.RuneSelf: return IndexByte(, byte()) case == utf8.RuneError: for := 0; < len(); { , := utf8.DecodeRune([:]) if == utf8.RuneError { return } += } return -1 case !utf8.ValidRune(): return -1 default: var [utf8.UTFMax]byte := utf8.EncodeRune([:], ) return Index(, [:]) } } // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points. // It returns the byte index of the first occurrence in s of any of the Unicode // code points in chars. It returns -1 if chars is empty or if there is no code // point in common. func ( []byte, string) int { if == "" { // Avoid scanning all of s. return -1 } if len() == 1 { := rune([0]) if >= utf8.RuneSelf { // search utf8.RuneError. for _, = range { if == utf8.RuneError { return 0 } } return -1 } if bytealg.IndexByteString(, [0]) >= 0 { return 0 } return -1 } if len() == 1 { := rune([0]) if >= utf8.RuneSelf { = utf8.RuneError } return IndexRune(, ) } if len() > 8 { if , := makeASCIISet(); { for , := range { if .contains() { return } } return -1 } } var int for := 0; < len(); += { := rune([]) if < utf8.RuneSelf { if bytealg.IndexByteString(, []) >= 0 { return } = 1 continue } , = utf8.DecodeRune([:]) if != utf8.RuneError { // r is 2 to 4 bytes if len() == { if == string() { return } continue } // Use bytealg.IndexString for performance if available. if bytealg.MaxLen >= { if bytealg.IndexString(, string()) >= 0 { return } continue } } for , := range { if == { return } } } return -1 } // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code // points. It returns the byte index of the last occurrence in s of any of // the Unicode code points in chars. It returns -1 if chars is empty or if // there is no code point in common. func ( []byte, string) int { if == "" { // Avoid scanning all of s. return -1 } if len() > 8 { if , := makeASCIISet(); { for := len() - 1; >= 0; -- { if .contains([]) { return } } return -1 } } if len() == 1 { := rune([0]) if >= utf8.RuneSelf { for _, = range { if == utf8.RuneError { return 0 } } return -1 } if bytealg.IndexByteString(, [0]) >= 0 { return 0 } return -1 } if len() == 1 { := rune([0]) if >= utf8.RuneSelf { = utf8.RuneError } for := len(); > 0; { , := utf8.DecodeLastRune([:]) -= if == { return } } return -1 } for := len(); > 0; { := rune([-1]) if < utf8.RuneSelf { if bytealg.IndexByteString(, [-1]) >= 0 { return - 1 } -- continue } , := utf8.DecodeLastRune([:]) -= if != utf8.RuneError { // r is 2 to 4 bytes if len() == { if == string() { return } continue } // Use bytealg.IndexString for performance if available. if bytealg.MaxLen >= { if bytealg.IndexString(, string()) >= 0 { return } continue } } for , := range { if == { return } } } return -1 } // Generic split: splits after each instance of sep, // including sepSave bytes of sep in the subslices. func genSplit(, []byte, , int) [][]byte { if == 0 { return nil } if len() == 0 { return explode(, ) } if < 0 { = Count(, ) + 1 } := make([][]byte, ) -- := 0 for < { := Index(, ) if < 0 { break } [] = [: + : +] = [+len():] ++ } [] = return [:+1] } // SplitN slices s into subslices separated by sep and returns a slice of // the subslices between those separators. // If sep is empty, SplitN splits after each UTF-8 sequence. // The count determines the number of subslices to return: // n > 0: at most n subslices; the last subslice will be the unsplit remainder. // n == 0: the result is nil (zero subslices) // n < 0: all subslices func (, []byte, int) [][]byte { return genSplit(, , 0, ) } // SplitAfterN slices s into subslices after each instance of sep and // returns a slice of those subslices. // If sep is empty, SplitAfterN splits after each UTF-8 sequence. // The count determines the number of subslices to return: // n > 0: at most n subslices; the last subslice will be the unsplit remainder. // n == 0: the result is nil (zero subslices) // n < 0: all subslices func (, []byte, int) [][]byte { return genSplit(, , len(), ) } // Split slices s into all subslices separated by sep and returns a slice of // the subslices between those separators. // If sep is empty, Split splits after each UTF-8 sequence. // It is equivalent to SplitN with a count of -1. func (, []byte) [][]byte { return genSplit(, , 0, -1) } // SplitAfter slices s into all subslices after each instance of sep and // returns a slice of those subslices. // If sep is empty, SplitAfter splits after each UTF-8 sequence. // It is equivalent to SplitAfterN with a count of -1. func (, []byte) [][]byte { return genSplit(, , len(), -1) } var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1} // Fields interprets s as a sequence of UTF-8-encoded code points. // It splits the slice s around each instance of one or more consecutive white space // characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an // empty slice if s contains only white space. func ( []byte) [][]byte { // First count the fields. // This is an exact count if s is ASCII, otherwise it is an approximation. := 0 := 1 // setBits is used to track which bits are set in the bytes of s. := uint8(0) for := 0; < len(); ++ { := [] |= := int(asciiSpace[]) += & ^ = } if >= utf8.RuneSelf { // Some runes in the input slice are not ASCII. return FieldsFunc(, unicode.IsSpace) } // ASCII fast path := make([][]byte, ) := 0 := 0 := 0 // Skip spaces in the front of the input. for < len() && asciiSpace[[]] != 0 { ++ } = for < len() { if asciiSpace[[]] == 0 { ++ continue } [] = [::] ++ ++ // Skip spaces in between fields. for < len() && asciiSpace[[]] != 0 { ++ } = } if < len() { // Last field might end at EOF. [] = [:len():len()] } return } // FieldsFunc interprets s as a sequence of UTF-8-encoded code points. // It splits the slice s at each run of code points c satisfying f(c) and // returns a slice of subslices of s. If all code points in s satisfy f(c), or // len(s) == 0, an empty slice is returned. // // FieldsFunc makes no guarantees about the order in which it calls f(c) // and assumes that f always returns the same value for a given c. func ( []byte, func(rune) bool) [][]byte { // A span is used to record a slice of s of the form s[start:end]. // The start index is inclusive and the end index is exclusive. type struct { int int } := make([], 0, 32) // Find the field start and end indices. // Doing this in a separate pass (rather than slicing the string s // and collecting the result substrings right away) is significantly // more efficient, possibly due to cache effects. := -1 // valid span start if >= 0 for := 0; < len(); { := 1 := rune([]) if >= utf8.RuneSelf { , = utf8.DecodeRune([:]) } if () { if >= 0 { = append(, {, }) = -1 } } else { if < 0 { = } } += } // Last field might end at EOF. if >= 0 { = append(, {, len()}) } // Create subslices from recorded field indices. := make([][]byte, len()) for , := range { [] = [.:.:.] } return } // Join concatenates the elements of s to create a new byte slice. The separator // sep is placed between elements in the resulting slice. func ( [][]byte, []byte) []byte { if len() == 0 { return []byte{} } if len() == 1 { // Just return a copy. return append([]byte(nil), [0]...) } := len() * (len() - 1) for , := range { += len() } := make([]byte, ) := copy(, [0]) for , := range [1:] { += copy([:], ) += copy([:], ) } return } // HasPrefix tests whether the byte slice s begins with prefix. func (, []byte) bool { return len() >= len() && Equal([0:len()], ) } // HasSuffix tests whether the byte slice s ends with suffix. func (, []byte) bool { return len() >= len() && Equal([len()-len():], ) } // Map returns a copy of the byte slice s with all its characters modified // according to the mapping function. If mapping returns a negative value, the character is // dropped from the byte slice with no replacement. The characters in s and the // output are interpreted as UTF-8-encoded code points. func ( func( rune) rune, []byte) []byte { // In the worst case, the slice can grow when mapped, making // things unpleasant. But it's so rare we barge in assuming it's // fine. It could also shrink but that falls out naturally. := len() // length of b := 0 // number of bytes encoded in b := make([]byte, ) for := 0; < len(); { := 1 := rune([]) if >= utf8.RuneSelf { , = utf8.DecodeRune([:]) } = () if >= 0 { := utf8.RuneLen() if < 0 { = len(string(utf8.RuneError)) } if + > { // Grow the buffer. = *2 + utf8.UTFMax := make([]byte, ) copy(, [0:]) = } += utf8.EncodeRune([:], ) } += } return [0:] } // Repeat returns a new byte slice consisting of count copies of b. // // It panics if count is negative or if // the result of (len(b) * count) overflows. func ( []byte, int) []byte { if == 0 { return []byte{} } // Since we cannot return an error on overflow, // we should panic if the repeat will generate // an overflow. // See Issue golang.org/issue/16237. if < 0 { panic("bytes: negative Repeat count") } else if len()*/ != len() { panic("bytes: Repeat count causes overflow") } := make([]byte, len()*) := copy(, ) for < len() { copy([:], [:]) *= 2 } return } // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to // their upper case. func ( []byte) []byte { , := true, false for := 0; < len(); ++ { := [] if >= utf8.RuneSelf { = false break } = || ('a' <= && <= 'z') } if { // optimize for ASCII-only byte slices. if ! { // Just return a copy. return append([]byte(""), ...) } := make([]byte, len()) for := 0; < len(); ++ { := [] if 'a' <= && <= 'z' { -= 'a' - 'A' } [] = } return } return Map(unicode.ToUpper, ) } // ToLower returns a copy of the byte slice s with all Unicode letters mapped to // their lower case. func ( []byte) []byte { , := true, false for := 0; < len(); ++ { := [] if >= utf8.RuneSelf { = false break } = || ('A' <= && <= 'Z') } if { // optimize for ASCII-only byte slices. if ! { return append([]byte(""), ...) } := make([]byte, len()) for := 0; < len(); ++ { := [] if 'A' <= && <= 'Z' { += 'a' - 'A' } [] = } return } return Map(unicode.ToLower, ) } // ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case. func ( []byte) []byte { return Map(unicode.ToTitle, ) } // ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their // upper case, giving priority to the special casing rules. func ( unicode.SpecialCase, []byte) []byte { return Map(.ToUpper, ) } // ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their // lower case, giving priority to the special casing rules. func ( unicode.SpecialCase, []byte) []byte { return Map(.ToLower, ) } // ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their // title case, giving priority to the special casing rules. func ( unicode.SpecialCase, []byte) []byte { return Map(.ToTitle, ) } // ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes // representing invalid UTF-8 replaced with the bytes in replacement, which may be empty. func (, []byte) []byte { := make([]byte, 0, len()+len()) := false // previous byte was from an invalid UTF-8 sequence for := 0; < len(); { := [] if < utf8.RuneSelf { ++ = false = append(, byte()) continue } , := utf8.DecodeRune([:]) if == 1 { ++ if ! { = true = append(, ...) } continue } = false = append(, [:+]...) += } return } // isSeparator reports whether the rune could mark a word boundary. // TODO: update when package unicode captures more of the properties. func isSeparator( rune) bool { // ASCII alphanumerics and underscore are not separators if <= 0x7F { switch { case '0' <= && <= '9': return false case 'a' <= && <= 'z': return false case 'A' <= && <= 'Z': return false case == '_': return false } return true } // Letters and digits are not separators if unicode.IsLetter() || unicode.IsDigit() { return false } // Otherwise, all we can do for now is treat spaces as separators. return unicode.IsSpace() } // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin // words mapped to their title case. // // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly. func ( []byte) []byte { // Use a closure here to remember state. // Hackish but effective. Depends on Map scanning in order and calling // the closure once per rune. := ' ' return Map( func( rune) rune { if isSeparator() { = return unicode.ToTitle() } = return }, ) } // TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off // all leading UTF-8-encoded code points c that satisfy f(c). func ( []byte, func( rune) bool) []byte { := indexFunc(, , false) if == -1 { return nil } return [:] } // TrimRightFunc returns a subslice of s by slicing off all trailing // UTF-8-encoded code points c that satisfy f(c). func ( []byte, func( rune) bool) []byte { := lastIndexFunc(, , false) if >= 0 && [] >= utf8.RuneSelf { , := utf8.DecodeRune([:]) += } else { ++ } return [0:] } // TrimFunc returns a subslice of s by slicing off all leading and trailing // UTF-8-encoded code points c that satisfy f(c). func ( []byte, func( rune) bool) []byte { return TrimRightFunc(TrimLeftFunc(, ), ) } // TrimPrefix returns s without the provided leading prefix string. // If s doesn't start with prefix, s is returned unchanged. func (, []byte) []byte { if HasPrefix(, ) { return [len():] } return } // TrimSuffix returns s without the provided trailing suffix string. // If s doesn't end with suffix, s is returned unchanged. func (, []byte) []byte { if HasSuffix(, ) { return [:len()-len()] } return } // IndexFunc interprets s as a sequence of UTF-8-encoded code points. // It returns the byte index in s of the first Unicode // code point satisfying f(c), or -1 if none do. func ( []byte, func( rune) bool) int { return indexFunc(, , true) } // LastIndexFunc interprets s as a sequence of UTF-8-encoded code points. // It returns the byte index in s of the last Unicode // code point satisfying f(c), or -1 if none do. func ( []byte, func( rune) bool) int { return lastIndexFunc(, , true) } // indexFunc is the same as IndexFunc except that if // truth==false, the sense of the predicate function is // inverted. func indexFunc( []byte, func( rune) bool, bool) int { := 0 for < len() { := 1 := rune([]) if >= utf8.RuneSelf { , = utf8.DecodeRune([:]) } if () == { return } += } return -1 } // lastIndexFunc is the same as LastIndexFunc except that if // truth==false, the sense of the predicate function is // inverted. func lastIndexFunc( []byte, func( rune) bool, bool) int { for := len(); > 0; { , := rune([-1]), 1 if >= utf8.RuneSelf { , = utf8.DecodeLastRune([0:]) } -= if () == { return } } return -1 } // asciiSet is a 32-byte value, where each bit represents the presence of a // given ASCII character in the set. The 128-bits of the lower 16 bytes, // starting with the least-significant bit of the lowest word to the // most-significant bit of the highest word, map to the full range of all // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed, // ensuring that any non-ASCII character will be reported as not in the set. type asciiSet [8]uint32 // makeASCIISet creates a set of ASCII characters and reports whether all // characters in chars are ASCII. func makeASCIISet( string) ( asciiSet, bool) { for := 0; < len(); ++ { := [] if >= utf8.RuneSelf { return , false } [>>5] |= 1 << uint(&31) } return , true } // contains reports whether c is inside the set. func ( *asciiSet) ( byte) bool { return ([>>5] & (1 << uint(&31))) != 0 } func makeCutsetFunc( string) func( rune) bool { if len() == 1 && [0] < utf8.RuneSelf { return func( rune) bool { return == rune([0]) } } if , := makeASCIISet(); { return func( rune) bool { return < utf8.RuneSelf && .contains(byte()) } } return func( rune) bool { for , := range { if == { return true } } return false } } // Trim returns a subslice of s by slicing off all leading and // trailing UTF-8-encoded code points contained in cutset. func ( []byte, string) []byte { return TrimFunc(, makeCutsetFunc()) } // TrimLeft returns a subslice of s by slicing off all leading // UTF-8-encoded code points contained in cutset. func ( []byte, string) []byte { return TrimLeftFunc(, makeCutsetFunc()) } // TrimRight returns a subslice of s by slicing off all trailing // UTF-8-encoded code points that are contained in cutset. func ( []byte, string) []byte { return TrimRightFunc(, makeCutsetFunc()) } // TrimSpace returns a subslice of s by slicing off all leading and // trailing white space, as defined by Unicode. func ( []byte) []byte { // Fast path for ASCII: look for the first ASCII non-space byte := 0 for ; < len(); ++ { := [] if >= utf8.RuneSelf { // If we run into a non-ASCII byte, fall back to the // slower unicode-aware method on the remaining bytes return TrimFunc([:], unicode.IsSpace) } if asciiSpace[] == 0 { break } } // Now look for the first ASCII non-space byte from the end := len() for ; > ; -- { := [-1] if >= utf8.RuneSelf { return TrimFunc([:], unicode.IsSpace) } if asciiSpace[] == 0 { break } } // At this point s[start:stop] starts and ends with an ASCII // non-space bytes, so we're done. Non-ASCII cases have already // been handled above. if == { // Special case to preserve previous TrimLeftFunc behavior, // returning nil instead of empty slice if all spaces. return nil } return [:] } // Runes interprets s as a sequence of UTF-8-encoded code points. // It returns a slice of runes (Unicode code points) equivalent to s. func ( []byte) []rune { := make([]rune, utf8.RuneCount()) := 0 for len() > 0 { , := utf8.DecodeRune() [] = ++ = [:] } return } // Replace returns a copy of the slice s with the first n // non-overlapping instances of old replaced by new. // If old is empty, it matches at the beginning of the slice // and after each UTF-8 sequence, yielding up to k+1 replacements // for a k-rune slice. // If n < 0, there is no limit on the number of replacements. func (, , []byte, int) []byte { := 0 if != 0 { // Compute number of replacements. = Count(, ) } if == 0 { // Just return a copy. return append([]byte(nil), ...) } if < 0 || < { = } // Apply replacements to buffer. := make([]byte, len()+*(len()-len())) := 0 := 0 for := 0; < ; ++ { := if len() == 0 { if > 0 { , := utf8.DecodeRune([:]) += } } else { += Index([:], ) } += copy([:], [:]) += copy([:], ) = + len() } += copy([:], [:]) return [0:] } // ReplaceAll returns a copy of the slice s with all // non-overlapping instances of old replaced by new. // If old is empty, it matches at the beginning of the slice // and after each UTF-8 sequence, yielding up to k+1 replacements // for a k-rune slice. func (, , []byte) []byte { return Replace(, , , -1) } // EqualFold reports whether s and t, interpreted as UTF-8 strings, // are equal under Unicode case-folding, which is a more general // form of case-insensitivity. func (, []byte) bool { for len() != 0 && len() != 0 { // Extract first rune from each. var , rune if [0] < utf8.RuneSelf { , = rune([0]), [1:] } else { , := utf8.DecodeRune() , = , [:] } if [0] < utf8.RuneSelf { , = rune([0]), [1:] } else { , := utf8.DecodeRune() , = , [:] } // If they match, keep going; if not, return false. // Easy case. if == { continue } // Make sr < tr to simplify what follows. if < { , = , } // Fast check for ASCII. if < utf8.RuneSelf { // ASCII only, sr/tr must be upper/lower case if 'A' <= && <= 'Z' && == +'a'-'A' { continue } return false } // General case. SimpleFold(x) returns the next equivalent rune > x // or wraps around to smaller values. := unicode.SimpleFold() for != && < { = unicode.SimpleFold() } if == { continue } return false } // One string is empty. Are both? return len() == len() } // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. func (, []byte) int { := len() switch { case == 0: return 0 case == 1: return IndexByte(, [0]) case == len(): if Equal(, ) { return 0 } return -1 case > len(): return -1 case <= bytealg.MaxLen: // Use brute force when s and sep both are small if len() <= bytealg.MaxBruteForce { return bytealg.Index(, ) } := [0] := [1] := 0 := len() - + 1 := 0 for < { if [] != { // IndexByte is faster than bytealg.Index, so use it as long as // we're not getting lots of false positives. := IndexByte([+1:], ) if < 0 { return -1 } += + 1 } if [+1] == && Equal([:+], ) { return } ++ ++ // Switch to bytealg.Index when IndexByte produces too many false positives. if > bytealg.Cutover() { := bytealg.Index([:], ) if >= 0 { return + } return -1 } } return -1 } := [0] := [1] := 0 := 0 := len() - + 1 for < { if [] != { := IndexByte([+1:], ) if < 0 { break } += + 1 } if [+1] == && Equal([:+], ) { return } ++ ++ if >= 4+>>4 && < { // Give up on IndexByte, it isn't skipping ahead // far enough to be better than Rabin-Karp. // Experiments (using IndexPeriodic) suggest // the cutover is about 16 byte skips. // TODO: if large prefixes of sep are matching // we should cutover at even larger average skips, // because Equal becomes that much more expensive. // This code does not take that effect into account. := bytealg.IndexRabinKarpBytes([:], ) if < 0 { return -1 } return + } } return -1 }