// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package suffixarray implements substring search in logarithmic time using // an in-memory suffix array. // // Example use: // // // create index for some data // index := suffixarray.New(data) // // // lookup byte slice s // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
package suffixarray import ( ) // Can change for testing var maxData32 int = realMaxData32 const realMaxData32 = math.MaxInt32 // Index implements a suffix array for fast substring search. type Index struct { data []byte sa ints // suffix array for data; sa.len() == len(data) } // An ints is either an []int32 or an []int64. // That is, one of them is empty, and one is the real data. // The int64 form is used when len(data) > maxData32 type ints struct { int32 []int32 int64 []int64 } func ( *ints) () int { return len(.int32) + len(.int64) } func ( *ints) ( int) int64 { if .int32 != nil { return int64(.int32[]) } return .int64[] } func ( *ints) ( int, int64) { if .int32 != nil { .int32[] = int32() } else { .int64[] = } } func ( *ints) (, int) ints { if .int32 != nil { return ints{.int32[:], nil} } return ints{nil, .int64[:]} } // New creates a new [Index] for data. // [Index] creation time is O(N) for N = len(data). func ( []byte) *Index { := &Index{data: } if len() <= maxData32 { .sa.int32 = make([]int32, len()) text_32(, .sa.int32) } else { .sa.int64 = make([]int64, len()) text_64(, .sa.int64) } return } // writeInt writes an int x to w using buf to buffer the write. func writeInt( io.Writer, []byte, int) error { binary.PutVarint(, int64()) , := .Write([0:binary.MaxVarintLen64]) return } // readInt reads an int x from r using buf to buffer the read and returns x. func readInt( io.Reader, []byte) (int64, error) { , := io.ReadFull(, [0:binary.MaxVarintLen64]) // ok to continue with error , := binary.Varint() return , } // writeSlice writes data[:n] to w and returns n. // It uses buf to buffer the write. func writeSlice( io.Writer, []byte, ints) ( int, error) { // encode as many elements as fit into buf := binary.MaxVarintLen64 := .len() for ; < && +binary.MaxVarintLen64 <= len(); ++ { += binary.PutUvarint([:], uint64(.get())) } // update buffer size binary.PutVarint(, int64()) // write buffer _, = .Write([0:]) return } var errTooBig = errors.New("suffixarray: data too large") // readSlice reads data[:n] from r and returns n. // It uses buf to buffer the read. func readSlice( io.Reader, []byte, ints) ( int, error) { // read buffer size var int64 , = readInt(, ) if != nil { return } if int64(int()) != || int() < 0 { // We never write chunks this big anyway. return 0, errTooBig } := int() // read buffer w/o the size if _, = io.ReadFull(, [binary.MaxVarintLen64:]); != nil { return } // decode as many elements as present in buf for := binary.MaxVarintLen64; < ; ++ { , := binary.Uvarint([:]) .set(, int64()) += } return } const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore // Read reads the index from r into x; x must not be nil. func ( *Index) ( io.Reader) error { // buffer for all reads := make([]byte, bufSize) // read length , := readInt(, ) if != nil { return } if int64(int()) != || int() < 0 { return errTooBig } := int() // allocate space if 2* < cap(.data) || cap(.data) < || .sa.int32 != nil && > maxData32 || .sa.int64 != nil && <= maxData32 { // new data is significantly smaller or larger than // existing buffers - allocate new ones .data = make([]byte, ) .sa.int32 = nil .sa.int64 = nil if <= maxData32 { .sa.int32 = make([]int32, ) } else { .sa.int64 = make([]int64, ) } } else { // re-use existing buffers .data = .data[0:] .sa = .sa.slice(0, ) } // read data if , := io.ReadFull(, .data); != nil { return } // read index := .sa for .len() > 0 { , := readSlice(, , ) if != nil { return } = .slice(, .len()) } return nil } // Write writes the index x to w. func ( *Index) ( io.Writer) error { // buffer for all writes := make([]byte, bufSize) // write length if := writeInt(, , len(.data)); != nil { return } // write data if , := .Write(.data); != nil { return } // write index := .sa for .len() > 0 { , := writeSlice(, , ) if != nil { return } = .slice(, .len()) } return nil } // Bytes returns the data over which the index was created. // It must not be modified. func ( *Index) () []byte { return .data } func ( *Index) ( int) []byte { return .data[.sa.get():] } // lookupAll returns a slice into the matching region of the index. // The runtime is O(log(N)*len(s)). func ( *Index) ( []byte) ints { // find matching suffix index range [i:j] // find the first index where s would be the prefix := sort.Search(.sa.len(), func( int) bool { return bytes.Compare(.at(), ) >= 0 }) // starting at i, find the first index at which s is not a prefix := + sort.Search(.sa.len()-, func( int) bool { return !bytes.HasPrefix(.at(+), ) }) return .sa.slice(, ) } // Lookup returns an unsorted list of at most n indices where the byte string s // occurs in the indexed data. If n < 0, all occurrences are returned. // The result is nil if s is empty, s is not found, or n == 0. // Lookup time is O(log(N)*len(s) + len(result)) where N is the // size of the indexed data. func ( *Index) ( []byte, int) ( []int) { if len() > 0 && != 0 { := .lookupAll() := .len() if < 0 || < { = } // 0 <= n <= count if > 0 { = make([]int, ) if .int32 != nil { for := range { [] = int(.int32[]) } } else { for := range { [] = int(.int64[]) } } } } return } // FindAllIndex returns a sorted list of non-overlapping matches of the // regular expression r, where a match is a pair of indices specifying // the matched slice of x.Bytes(). If n < 0, all matches are returned // in successive order. Otherwise, at most n matches are returned and // they may not be successive. The result is nil if there are no matches, // or if n == 0. func ( *Index) ( *regexp.Regexp, int) ( [][]int) { // a non-empty literal prefix is used to determine possible // match start indices with Lookup , := .LiteralPrefix() := []byte() // worst-case scenario: no literal prefix if == "" { return .FindAllIndex(.data, ) } // if regexp is a literal just use Lookup and convert its // result into match pairs if { // Lookup returns indices that may belong to overlapping matches. // After eliminating them, we may end up with fewer than n matches. // If we don't have enough at the end, redo the search with an // increased value n1, but only if Lookup returned all the requested // indices in the first place (if it returned fewer than that then // there cannot be more). for := ; ; += 2 * ( - len()) /* overflow ok */ { := .Lookup(, ) if len() == 0 { return } slices.Sort() := make([]int, 2*len()) = make([][]int, len()) := 0 := 0 for , := range { if == { break } // ignore indices leading to overlapping matches if <= { := 2 * [+0] = [+1] = + len() [] = [ : +2] ++ = + len() } } = [0:] if len() >= || len() != { // found all matches or there's no chance to find more // (n and n1 can be negative) break } } if len() == 0 { = nil } return } // regexp has a non-empty literal prefix; Lookup(lit) computes // the indices of possible complete matches; use these as starting // points for anchored searches // (regexp "^" matches beginning of input, not beginning of line) = regexp.MustCompile("^" + .String()) // compiles because r compiled // same comment about Lookup applies here as in the loop above for := ; ; += 2 * ( - len()) /* overflow ok */ { := .Lookup(, ) if len() == 0 { return } slices.Sort() = [0:0] := 0 for , := range { if len() == { break } := .FindIndex(.data[:]) // anchored search - will not run off // ignore indices leading to overlapping matches if != nil && <= { [0] = // correct m [1] += = append(, ) = [1] } } if len() >= || len() != { // found all matches or there's no chance to find more // (n and n1 can be negative) break } } if len() == 0 { = nil } return }