// Copyright 2010 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.
// Package suffixarray implements substring search in logarithmic time using// an in-memory suffix array.//// Example use://// // create index for some data// index := suffixarray.New(data)//// // lookup byte slice s// offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data// offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
package suffixarrayimport ()// Can change for testingvar maxData32 int = realMaxData32const realMaxData32 = math.MaxInt32// Index implements a suffix array for fast substring search.typeIndexstruct { data []byte sa ints// suffix array for data; sa.len() == len(data)}// An ints is either an []int32 or an []int64.// That is, one of them is empty, and one is the real data.// The int64 form is used when len(data) > maxData32type ints struct { int32 []int32 int64 []int64}func ( *ints) () int {returnlen(.int32) + len(.int64)}func ( *ints) ( int) int64 {if .int32 != nil {returnint64(.int32[]) }return .int64[]}func ( *ints) ( int, int64) {if .int32 != nil { .int32[] = int32() } else { .int64[] = }}func ( *ints) (, int) ints {if .int32 != nil {returnints{.int32[:], nil} }returnints{nil, .int64[:]}}// New creates a new [Index] for data.// [Index] creation time is O(N) for N = len(data).func ( []byte) *Index { := &Index{data: }iflen() <= maxData32 { .sa.int32 = make([]int32, len())text_32(, .sa.int32) } else { .sa.int64 = make([]int64, len())text_64(, .sa.int64) }return}// writeInt writes an int x to w using buf to buffer the write.func writeInt( io.Writer, []byte, int) error {binary.PutVarint(, int64()) , := .Write([0:binary.MaxVarintLen64])return}// readInt reads an int x from r using buf to buffer the read and returns x.func readInt( io.Reader, []byte) (int64, error) { , := io.ReadFull(, [0:binary.MaxVarintLen64]) // ok to continue with error , := binary.Varint()return , }// writeSlice writes data[:n] to w and returns n.// It uses buf to buffer the write.func writeSlice( io.Writer, []byte, ints) ( int, error) {// encode as many elements as fit into buf := binary.MaxVarintLen64 := .len()for ; < && +binary.MaxVarintLen64 <= len(); ++ { += binary.PutUvarint([:], uint64(.get())) }// update buffer sizebinary.PutVarint(, int64())// write buffer _, = .Write([0:])return}var errTooBig = errors.New("suffixarray: data too large")// readSlice reads data[:n] from r and returns n.// It uses buf to buffer the read.func readSlice( io.Reader, []byte, ints) ( int, error) {// read buffer sizevarint64 , = readInt(, )if != nil {return }ifint64(int()) != || int() < 0 {// We never write chunks this big anyway.return0, errTooBig } := int()// read buffer w/o the sizeif _, = io.ReadFull(, [binary.MaxVarintLen64:]); != nil {return }// decode as many elements as present in buffor := binary.MaxVarintLen64; < ; ++ { , := binary.Uvarint([:]) .set(, int64()) += }return}const bufSize = 16 << 10// reasonable for BenchmarkSaveRestore// Read reads the index from r into x; x must not be nil.func ( *Index) ( io.Reader) error {// buffer for all reads := make([]byte, bufSize)// read length , := readInt(, )if != nil {return }ifint64(int()) != || int() < 0 {returnerrTooBig } := int()// allocate spaceif2* < cap(.data) || cap(.data) < || .sa.int32 != nil && > maxData32 || .sa.int64 != nil && <= maxData32 {// new data is significantly smaller or larger than // existing buffers - allocate new ones .data = make([]byte, ) .sa.int32 = nil .sa.int64 = nilif <= maxData32 { .sa.int32 = make([]int32, ) } else { .sa.int64 = make([]int64, ) } } else {// re-use existing buffers .data = .data[0:] .sa = .sa.slice(0, ) }// read dataif , := io.ReadFull(, .data); != nil {return }// read index := .safor .len() > 0 { , := readSlice(, , )if != nil {return } = .slice(, .len()) }returnnil}// Write writes the index x to w.func ( *Index) ( io.Writer) error {// buffer for all writes := make([]byte, bufSize)// write lengthif := writeInt(, , len(.data)); != nil {return }// write dataif , := .Write(.data); != nil {return }// write index := .safor .len() > 0 { , := writeSlice(, , )if != nil {return } = .slice(, .len()) }returnnil}// Bytes returns the data over which the index was created.// It must not be modified.func ( *Index) () []byte {return .data}func ( *Index) ( int) []byte {return .data[.sa.get():]}// lookupAll returns a slice into the matching region of the index.// The runtime is O(log(N)*len(s)).func ( *Index) ( []byte) ints {// find matching suffix index range [i:j] // find the first index where s would be the prefix := sort.Search(.sa.len(), func( int) bool { returnbytes.Compare(.at(), ) >= 0 })// starting at i, find the first index at which s is not a prefix := + sort.Search(.sa.len()-, func( int) bool { return !bytes.HasPrefix(.at(+), ) })return .sa.slice(, )}// Lookup returns an unsorted list of at most n indices where the byte string s// occurs in the indexed data. If n < 0, all occurrences are returned.// The result is nil if s is empty, s is not found, or n == 0.// Lookup time is O(log(N)*len(s) + len(result)) where N is the// size of the indexed data.func ( *Index) ( []byte, int) ( []int) {iflen() > 0 && != 0 { := .lookupAll() := .len()if < 0 || < { = }// 0 <= n <= countif > 0 { = make([]int, )if .int32 != nil {for := range { [] = int(.int32[]) } } else {for := range { [] = int(.int64[]) } } } }return}// FindAllIndex returns a sorted list of non-overlapping matches of the// regular expression r, where a match is a pair of indices specifying// the matched slice of x.Bytes(). If n < 0, all matches are returned// in successive order. Otherwise, at most n matches are returned and// they may not be successive. The result is nil if there are no matches,// or if n == 0.func ( *Index) ( *regexp.Regexp, int) ( [][]int) {// a non-empty literal prefix is used to determine possible // match start indices with Lookup , := .LiteralPrefix() := []byte()// worst-case scenario: no literal prefixif == "" {return .FindAllIndex(.data, ) }// if regexp is a literal just use Lookup and convert its // result into match pairsif {// Lookup returns indices that may belong to overlapping matches. // After eliminating them, we may end up with fewer than n matches. // If we don't have enough at the end, redo the search with an // increased value n1, but only if Lookup returned all the requested // indices in the first place (if it returned fewer than that then // there cannot be more).for := ; ; += 2 * ( - len()) /* overflow ok */ { := .Lookup(, )iflen() == 0 {return }slices.Sort() := make([]int, 2*len()) = make([][]int, len()) := 0 := 0for , := range {if == {break }// ignore indices leading to overlapping matchesif <= { := 2 * [+0] = [+1] = + len() [] = [ : +2] ++ = + len() } } = [0:]iflen() >= || len() != {// found all matches or there's no chance to find more // (n and n1 can be negative)break } }iflen() == 0 { = nil }return }// regexp has a non-empty literal prefix; Lookup(lit) computes // the indices of possible complete matches; use these as starting // points for anchored searches // (regexp "^" matches beginning of input, not beginning of line) = regexp.MustCompile("^" + .String()) // compiles because r compiled// same comment about Lookup applies here as in the loop abovefor := ; ; += 2 * ( - len()) /* overflow ok */ { := .Lookup(, )iflen() == 0 {return }slices.Sort() = [0:0] := 0for , := range {iflen() == {break } := .FindIndex(.data[:]) // anchored search - will not run off// ignore indices leading to overlapping matchesif != nil && <= { [0] = // correct m [1] += = append(, ) = [1] } }iflen() >= || len() != {// found all matches or there's no chance to find more // (n and n1 can be negative)break } }iflen() == 0 { = nil }return}
The pages are generated with Goldsv0.7.3. (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PR and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.