Source File
search.go
Belonging Package
strings
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package strings
// stringFinder efficiently finds strings in a source text. It's implemented
// using the Boyer-Moore string search algorithm:
// https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
// https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
// document uses 1-based indexing)
type stringFinder struct {
// pattern is the string that we are searching for in the text.
pattern string
// badCharSkip[b] contains the distance between the last byte of pattern
// and the rightmost occurrence of b in pattern. If b is not in pattern,
// badCharSkip[b] is len(pattern).
//
// Whenever a mismatch is found with byte b in the text, we can safely
// shift the matching frame at least badCharSkip[b] until the next time
// the matching char could be in alignment.
badCharSkip [256]int
// goodSuffixSkip[i] defines how far we can shift the matching frame given
// that the suffix pattern[i+1:] matches, but the byte pattern[i] does
// not. There are two cases to consider:
//
// 1. The matched suffix occurs elsewhere in pattern (with a different
// byte preceding it that we might possibly match). In this case, we can
// shift the matching frame to align with the next suffix chunk. For
// example, the pattern "mississi" has the suffix "issi" next occurring
// (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
// shift+len(suffix) == 3+4 == 7.
//
// 2. If the matched suffix does not occur elsewhere in pattern, then the
// matching frame may share part of its prefix with the end of the
// matching suffix. In this case, goodSuffixSkip[i] will contain how far
// to shift the frame to align this portion of the prefix to the
// suffix. For example, in the pattern "abcxxxabc", when the first
// mismatch from the back is found to be in position 3, the matching
// suffix "xxabc" is not found elsewhere in the pattern. However, its
// rightmost "abc" (at position 6) is a prefix of the whole pattern, so
// goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
goodSuffixSkip []int
}
func makeStringFinder( string) *stringFinder {
:= &stringFinder{
pattern: ,
goodSuffixSkip: make([]int, len()),
}
// last is the index of the last character in the pattern.
:= len() - 1
// Build bad character table.
// Bytes not in the pattern can skip one pattern's length.
for := range .badCharSkip {
.badCharSkip[] = len()
}
// The loop condition is < instead of <= so that the last byte does not
// have a zero distance to itself. Finding this byte out of place implies
// that it is not in the last position.
for := 0; < ; ++ {
.badCharSkip[[]] = -
}
// Build good suffix table.
// First pass: set each value to the next index which starts a prefix of
// pattern.
:=
for := ; >= 0; -- {
if HasPrefix(, [+1:]) {
= + 1
}
// lastPrefix is the shift, and (last-i) is len(suffix).
.goodSuffixSkip[] = + -
}
// Second pass: find repeats of pattern's suffix starting from the front.
for := 0; < ; ++ {
:= longestCommonSuffix(, [1:+1])
if [-] != [-] {
// (last-i) is the shift, and lenSuffix is len(suffix).
.goodSuffixSkip[-] = + -
}
}
return
}
func longestCommonSuffix(, string) ( int) {
for ; < len() && < len(); ++ {
if [len()-1-] != [len()-1-] {
break
}
}
return
}
// next returns the index in text of the first occurrence of the pattern. If
// the pattern is not found, it returns -1.
func ( *stringFinder) ( string) int {
:= len(.pattern) - 1
for < len() {
// Compare backwards from the end until the first unmatching character.
:= len(.pattern) - 1
for >= 0 && [] == .pattern[] {
--
--
}
if < 0 {
return + 1 // match
}
+= max(.badCharSkip[[]], .goodSuffixSkip[])
}
return -1
}
The pages are generated with Golds v0.7.3. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds. |