Source File
sais2.go
Belonging Package
index/suffixarray
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by go generate; DO NOT EDIT.
package suffixarray
func text_64( []byte, []int64) {
if int(int64(len())) != len() || len() != len() {
panic("suffixarray: misuse of text_64")
}
sais_8_64(, 256, , make([]int64, 2*256))
}
func sais_8_64( []byte, int, , []int64) {
if len() != len() || len() < {
panic("suffixarray: misuse of sais_8_64")
}
// Trivial base cases. Sorting 0 or 1 things is easy.
if len() == 0 {
return
}
if len() == 1 {
[0] = 0
return
}
// Establish slices indexed by text character
// holding character frequency and bucket-sort offsets.
// If there's only enough tmp for one slice,
// we make it the bucket offsets and recompute
// the character frequency each time we need it.
var , []int64
if len() >= 2* {
, = [:], [:2*]
[0] = -1 // mark as uninitialized
} else {
, = nil, [:]
}
// The SAIS algorithm.
// Each of these calls makes one scan through sa.
// See the individual functions for documentation
// about each's role in the algorithm.
:= placeLMS_8_64(, , , )
if <= 1 {
// 0 or 1 items are already sorted. Do nothing.
} else {
induceSubL_8_64(, , , )
induceSubS_8_64(, , , )
length_8_64(, , )
:= assignID_8_64(, , )
if < {
map_64(, )
recurse_64(, , , )
unmap_8_64(, , )
} else {
// If maxID == numLMS, then each LMS-substring
// is unique, so the relative ordering of two LMS-suffixes
// is determined by just the leading LMS-substring.
// That is, the LMS-suffix sort order matches the
// (simpler) LMS-substring sort order.
// Copy the original LMS-substring order into the
// suffix array destination.
copy(, [len()-:])
}
expand_8_64(, , , , )
}
induceL_8_64(, , , )
induceS_8_64(, , , )
// Mark for caller that we overwrote tmp.
[0] = -1
}
func sais_32( []int32, int, , []int32) {
if len() != len() || len() < {
panic("suffixarray: misuse of sais_32")
}
// Trivial base cases. Sorting 0 or 1 things is easy.
if len() == 0 {
return
}
if len() == 1 {
[0] = 0
return
}
// Establish slices indexed by text character
// holding character frequency and bucket-sort offsets.
// If there's only enough tmp for one slice,
// we make it the bucket offsets and recompute
// the character frequency each time we need it.
var , []int32
if len() >= 2* {
, = [:], [:2*]
[0] = -1 // mark as uninitialized
} else {
, = nil, [:]
}
// The SAIS algorithm.
// Each of these calls makes one scan through sa.
// See the individual functions for documentation
// about each's role in the algorithm.
:= placeLMS_32(, , , )
if <= 1 {
// 0 or 1 items are already sorted. Do nothing.
} else {
induceSubL_32(, , , )
induceSubS_32(, , , )
length_32(, , )
:= assignID_32(, , )
if < {
map_32(, )
recurse_32(, , , )
unmap_32(, , )
} else {
// If maxID == numLMS, then each LMS-substring
// is unique, so the relative ordering of two LMS-suffixes
// is determined by just the leading LMS-substring.
// That is, the LMS-suffix sort order matches the
// (simpler) LMS-substring sort order.
// Copy the original LMS-substring order into the
// suffix array destination.
copy(, [len()-:])
}
expand_32(, , , , )
}
induceL_32(, , , )
induceS_32(, , , )
// Mark for caller that we overwrote tmp.
[0] = -1
}
func sais_64( []int64, int, , []int64) {
if len() != len() || len() < {
panic("suffixarray: misuse of sais_64")
}
// Trivial base cases. Sorting 0 or 1 things is easy.
if len() == 0 {
return
}
if len() == 1 {
[0] = 0
return
}
// Establish slices indexed by text character
// holding character frequency and bucket-sort offsets.
// If there's only enough tmp for one slice,
// we make it the bucket offsets and recompute
// the character frequency each time we need it.
var , []int64
if len() >= 2* {
, = [:], [:2*]
[0] = -1 // mark as uninitialized
} else {
, = nil, [:]
}
// The SAIS algorithm.
// Each of these calls makes one scan through sa.
// See the individual functions for documentation
// about each's role in the algorithm.
:= placeLMS_64(, , , )
if <= 1 {
// 0 or 1 items are already sorted. Do nothing.
} else {
induceSubL_64(, , , )
induceSubS_64(, , , )
length_64(, , )
:= assignID_64(, , )
if < {
map_64(, )
recurse_64(, , , )
unmap_64(, , )
} else {
// If maxID == numLMS, then each LMS-substring
// is unique, so the relative ordering of two LMS-suffixes
// is determined by just the leading LMS-substring.
// That is, the LMS-suffix sort order matches the
// (simpler) LMS-substring sort order.
// Copy the original LMS-substring order into the
// suffix array destination.
copy(, [len()-:])
}
expand_64(, , , , )
}
induceL_64(, , , )
induceS_64(, , , )
// Mark for caller that we overwrote tmp.
[0] = -1
}
func freq_8_64( []byte, , []int64) []int64 {
if != nil && [0] >= 0 {
return // already computed
}
if == nil {
=
}
= [:256] // eliminate bounds check for freq[c] below
clear()
for , := range {
[]++
}
return
}
func freq_32( []int32, , []int32) []int32 {
if != nil && [0] >= 0 {
return // already computed
}
if == nil {
=
}
clear()
for , := range {
[]++
}
return
}
func freq_64( []int64, , []int64) []int64 {
if != nil && [0] >= 0 {
return // already computed
}
if == nil {
=
}
clear()
for , := range {
[]++
}
return
}
func bucketMin_8_64( []byte, , []int64) {
= freq_8_64(, , )
= [:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
= [:256] // eliminate bounds check for bucket[i] below
:= int64(0)
for , := range {
[] =
+=
}
}
func bucketMin_32( []int32, , []int32) {
= freq_32(, , )
:= int32(0)
for , := range {
[] =
+=
}
}
func bucketMin_64( []int64, , []int64) {
= freq_64(, , )
:= int64(0)
for , := range {
[] =
+=
}
}
func bucketMax_8_64( []byte, , []int64) {
= freq_8_64(, , )
= [:256] // establish len(freq) = 256, so 0 ≤ i < 256 below
= [:256] // eliminate bounds check for bucket[i] below
:= int64(0)
for , := range {
+=
[] =
}
}
func bucketMax_32( []int32, , []int32) {
= freq_32(, , )
:= int32(0)
for , := range {
+=
[] =
}
}
func bucketMax_64( []int64, , []int64) {
= freq_64(, , )
:= int64(0)
for , := range {
+=
[] =
}
}
func placeLMS_8_64( []byte, , , []int64) int {
bucketMax_8_64(, , )
:= 0
:= int64(-1)
= [:256] // eliminate bounds check for bucket[c1] below
// The next stanza of code (until the blank line) loop backward
// over text, stopping to execute a code body at each position i
// such that text[i] is an L-character and text[i+1] is an S-character.
// That is, i+1 is the position of the start of an LMS-substring.
// These could be hoisted out into a function with a callback,
// but at a significant speed cost. Instead, we just write these
// seven lines a few times in this source file. The copies below
// refer back to the pattern established by this original as the
// "LMS-substring iterator".
//
// In every scan through the text, c0, c1 are successive characters of text.
// In this backward scan, c0 == text[i] and c1 == text[i+1].
// By scanning backward, we can keep track of whether the current
// position is type-S or type-L according to the usual definition:
//
// - position len(text) is type S with text[len(text)] == -1 (the sentinel)
// - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
// - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
//
// The backward scan lets us maintain the current type,
// update it when we see c0 != c1, and otherwise leave it alone.
// We want to identify all S positions with a preceding L.
// Position len(text) is one such position by definition, but we have
// nowhere to write it down, so we eliminate it by untruthfully
// setting isTypeS = false at the start of the loop.
, , := byte(0), byte(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Bucket the index i+1 for the start of an LMS-substring.
:= [] - 1
[] =
[] = int64( + 1)
=
++
}
}
// We recorded the LMS-substring starts but really want the ends.
// Luckily, with two differences, the start indexes and the end indexes are the same.
// The first difference is that the rightmost LMS-substring's end index is len(text),
// so the caller must pretend that sa[-1] == len(text), as noted above.
// The second difference is that the first leftmost LMS-substring start index
// does not end an earlier LMS-substring, so as an optimization we can omit
// that leftmost LMS-substring start index (the last one we wrote).
//
// Exception: if numLMS <= 1, the caller is not going to bother with
// the recursion at all and will treat the result as containing LMS-substring starts.
// In that case, we don't remove the final entry.
if > 1 {
[] = 0
}
return
}
func placeLMS_32( []int32, , , []int32) int {
bucketMax_32(, , )
:= 0
:= int32(-1)
// The next stanza of code (until the blank line) loop backward
// over text, stopping to execute a code body at each position i
// such that text[i] is an L-character and text[i+1] is an S-character.
// That is, i+1 is the position of the start of an LMS-substring.
// These could be hoisted out into a function with a callback,
// but at a significant speed cost. Instead, we just write these
// seven lines a few times in this source file. The copies below
// refer back to the pattern established by this original as the
// "LMS-substring iterator".
//
// In every scan through the text, c0, c1 are successive characters of text.
// In this backward scan, c0 == text[i] and c1 == text[i+1].
// By scanning backward, we can keep track of whether the current
// position is type-S or type-L according to the usual definition:
//
// - position len(text) is type S with text[len(text)] == -1 (the sentinel)
// - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
// - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
//
// The backward scan lets us maintain the current type,
// update it when we see c0 != c1, and otherwise leave it alone.
// We want to identify all S positions with a preceding L.
// Position len(text) is one such position by definition, but we have
// nowhere to write it down, so we eliminate it by untruthfully
// setting isTypeS = false at the start of the loop.
, , := int32(0), int32(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Bucket the index i+1 for the start of an LMS-substring.
:= [] - 1
[] =
[] = int32( + 1)
=
++
}
}
// We recorded the LMS-substring starts but really want the ends.
// Luckily, with two differences, the start indexes and the end indexes are the same.
// The first difference is that the rightmost LMS-substring's end index is len(text),
// so the caller must pretend that sa[-1] == len(text), as noted above.
// The second difference is that the first leftmost LMS-substring start index
// does not end an earlier LMS-substring, so as an optimization we can omit
// that leftmost LMS-substring start index (the last one we wrote).
//
// Exception: if numLMS <= 1, the caller is not going to bother with
// the recursion at all and will treat the result as containing LMS-substring starts.
// In that case, we don't remove the final entry.
if > 1 {
[] = 0
}
return
}
func placeLMS_64( []int64, , , []int64) int {
bucketMax_64(, , )
:= 0
:= int64(-1)
// The next stanza of code (until the blank line) loop backward
// over text, stopping to execute a code body at each position i
// such that text[i] is an L-character and text[i+1] is an S-character.
// That is, i+1 is the position of the start of an LMS-substring.
// These could be hoisted out into a function with a callback,
// but at a significant speed cost. Instead, we just write these
// seven lines a few times in this source file. The copies below
// refer back to the pattern established by this original as the
// "LMS-substring iterator".
//
// In every scan through the text, c0, c1 are successive characters of text.
// In this backward scan, c0 == text[i] and c1 == text[i+1].
// By scanning backward, we can keep track of whether the current
// position is type-S or type-L according to the usual definition:
//
// - position len(text) is type S with text[len(text)] == -1 (the sentinel)
// - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.
// - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.
//
// The backward scan lets us maintain the current type,
// update it when we see c0 != c1, and otherwise leave it alone.
// We want to identify all S positions with a preceding L.
// Position len(text) is one such position by definition, but we have
// nowhere to write it down, so we eliminate it by untruthfully
// setting isTypeS = false at the start of the loop.
, , := int64(0), int64(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Bucket the index i+1 for the start of an LMS-substring.
:= [] - 1
[] =
[] = int64( + 1)
=
++
}
}
// We recorded the LMS-substring starts but really want the ends.
// Luckily, with two differences, the start indexes and the end indexes are the same.
// The first difference is that the rightmost LMS-substring's end index is len(text),
// so the caller must pretend that sa[-1] == len(text), as noted above.
// The second difference is that the first leftmost LMS-substring start index
// does not end an earlier LMS-substring, so as an optimization we can omit
// that leftmost LMS-substring start index (the last one we wrote).
//
// Exception: if numLMS <= 1, the caller is not going to bother with
// the recursion at all and will treat the result as containing LMS-substring starts.
// In that case, we don't remove the final entry.
if > 1 {
[] = 0
}
return
}
func induceSubL_8_64( []byte, , , []int64) {
// Initialize positions for left side of character buckets.
bucketMin_8_64(, , )
= [:256] // eliminate bounds check for bucket[cB] below
// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
// Because j-1 is type L, inserting it into sa now will sort it correctly.
// But we want to distinguish a j-1 with j-2 of type L from type S.
// We can process the former but want to leave the latter for the caller.
// We record the difference by negating j-1 if it is preceded by type S.
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
// and so on, in sorted but not necessarily adjacent order, until it finds
// one preceded by an index of type S, at which point it must stop.
//
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
// only the indexes of the leftmost L-type indexes for each LMS-substring.
//
// The suffix array sa therefore serves simultaneously as input, output,
// and a miraculously well-tailored work queue.
// placeLMS_8_64 left out the implicit entry sa[-1] == len(text),
// corresponding to the identified type-L index len(text)-1.
// Process it before the left-to-right scan of sa proper.
// See body in loop for commentary.
:= len() - 1
, := [-1], []
if < {
= -
}
// Cache recently used bucket index:
// we're processing suffixes in sorted order
// and accessing buckets indexed by the
// byte before the sorted order, which still
// has very good locality.
// Invariant: b is cached, possibly dirty copy of bucket[cB].
:=
:= []
[] = int64()
++
for := 0; < len(); ++ {
:= int([])
if == 0 {
// Skip empty entry.
continue
}
if < 0 {
// Leave discovered type-S index for caller.
[] = int64(-)
continue
}
[] = 0
// Index j was on work queue, meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is L-type, queue k for processing later in this loop.
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
:= - 1
, := [-1], []
if < {
= -
}
if != {
[] =
=
= []
}
[] = int64()
++
}
}
func induceSubL_32( []int32, , , []int32) {
// Initialize positions for left side of character buckets.
bucketMin_32(, , )
// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
// Because j-1 is type L, inserting it into sa now will sort it correctly.
// But we want to distinguish a j-1 with j-2 of type L from type S.
// We can process the former but want to leave the latter for the caller.
// We record the difference by negating j-1 if it is preceded by type S.
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
// and so on, in sorted but not necessarily adjacent order, until it finds
// one preceded by an index of type S, at which point it must stop.
//
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
// only the indexes of the leftmost L-type indexes for each LMS-substring.
//
// The suffix array sa therefore serves simultaneously as input, output,
// and a miraculously well-tailored work queue.
// placeLMS_32 left out the implicit entry sa[-1] == len(text),
// corresponding to the identified type-L index len(text)-1.
// Process it before the left-to-right scan of sa proper.
// See body in loop for commentary.
:= len() - 1
, := [-1], []
if < {
= -
}
// Cache recently used bucket index:
// we're processing suffixes in sorted order
// and accessing buckets indexed by the
// int32 before the sorted order, which still
// has very good locality.
// Invariant: b is cached, possibly dirty copy of bucket[cB].
:=
:= []
[] = int32()
++
for := 0; < len(); ++ {
:= int([])
if == 0 {
// Skip empty entry.
continue
}
if < 0 {
// Leave discovered type-S index for caller.
[] = int32(-)
continue
}
[] = 0
// Index j was on work queue, meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is L-type, queue k for processing later in this loop.
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
:= - 1
, := [-1], []
if < {
= -
}
if != {
[] =
=
= []
}
[] = int32()
++
}
}
func induceSubL_64( []int64, , , []int64) {
// Initialize positions for left side of character buckets.
bucketMin_64(, , )
// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.
// Because j-1 is type L, inserting it into sa now will sort it correctly.
// But we want to distinguish a j-1 with j-2 of type L from type S.
// We can process the former but want to leave the latter for the caller.
// We record the difference by negating j-1 if it is preceded by type S.
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
// and so on, in sorted but not necessarily adjacent order, until it finds
// one preceded by an index of type S, at which point it must stop.
//
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing
// only the indexes of the leftmost L-type indexes for each LMS-substring.
//
// The suffix array sa therefore serves simultaneously as input, output,
// and a miraculously well-tailored work queue.
// placeLMS_64 left out the implicit entry sa[-1] == len(text),
// corresponding to the identified type-L index len(text)-1.
// Process it before the left-to-right scan of sa proper.
// See body in loop for commentary.
:= len() - 1
, := [-1], []
if < {
= -
}
// Cache recently used bucket index:
// we're processing suffixes in sorted order
// and accessing buckets indexed by the
// int64 before the sorted order, which still
// has very good locality.
// Invariant: b is cached, possibly dirty copy of bucket[cB].
:=
:= []
[] = int64()
++
for := 0; < len(); ++ {
:= int([])
if == 0 {
// Skip empty entry.
continue
}
if < 0 {
// Leave discovered type-S index for caller.
[] = int64(-)
continue
}
[] = 0
// Index j was on work queue, meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is L-type, queue k for processing later in this loop.
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
:= - 1
, := [-1], []
if < {
= -
}
if != {
[] =
=
= []
}
[] = int64()
++
}
}
func induceSubS_8_64( []byte, , , []int64) {
// Initialize positions for right side of character buckets.
bucketMax_8_64(, , )
= [:256] // eliminate bounds check for bucket[cB] below
// Analogous to induceSubL_8_64 above,
// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
// Because j-1 is type S, inserting it into sa now will sort it correctly.
// But we want to distinguish a j-1 with j-2 of type S from type L.
// We can process the former but want to leave the latter for the caller.
// We record the difference by negating j-1 if it is preceded by type L.
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
// and so on, in sorted but not necessarily adjacent order, until it finds
// one preceded by an index of type L, at which point it must stop.
// That index (preceded by one of type L) is an LMS-substring start.
//
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
// so that the loop finishes with the top of sa containing exactly
// the LMS-substring start indexes, sorted by LMS-substring.
// Cache recently used bucket index:
:= byte(0)
:= []
:= len()
for := len() - 1; >= 0; -- {
:= int([])
if == 0 {
// Skip empty entry.
continue
}
[] = 0
if < 0 {
// Leave discovered LMS-substring start index for caller.
--
[] = int64(-)
continue
}
// Index j was on work queue, meaning k := j-1 is S-type,
// so we can now place k correctly into sa.
// If k-1 is S-type, queue k for processing later in this loop.
// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
:= - 1
:= []
:= [-1]
if > {
= -
}
if != {
[] =
=
= []
}
--
[] = int64()
}
}
func induceSubS_32( []int32, , , []int32) {
// Initialize positions for right side of character buckets.
bucketMax_32(, , )
// Analogous to induceSubL_32 above,
// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
// Because j-1 is type S, inserting it into sa now will sort it correctly.
// But we want to distinguish a j-1 with j-2 of type S from type L.
// We can process the former but want to leave the latter for the caller.
// We record the difference by negating j-1 if it is preceded by type L.
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
// and so on, in sorted but not necessarily adjacent order, until it finds
// one preceded by an index of type L, at which point it must stop.
// That index (preceded by one of type L) is an LMS-substring start.
//
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
// so that the loop finishes with the top of sa containing exactly
// the LMS-substring start indexes, sorted by LMS-substring.
// Cache recently used bucket index:
:= int32(0)
:= []
:= len()
for := len() - 1; >= 0; -- {
:= int([])
if == 0 {
// Skip empty entry.
continue
}
[] = 0
if < 0 {
// Leave discovered LMS-substring start index for caller.
--
[] = int32(-)
continue
}
// Index j was on work queue, meaning k := j-1 is S-type,
// so we can now place k correctly into sa.
// If k-1 is S-type, queue k for processing later in this loop.
// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
:= - 1
:= []
:= [-1]
if > {
= -
}
if != {
[] =
=
= []
}
--
[] = int32()
}
}
func induceSubS_64( []int64, , , []int64) {
// Initialize positions for right side of character buckets.
bucketMax_64(, , )
// Analogous to induceSubL_64 above,
// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly
// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.
// Because j-1 is type S, inserting it into sa now will sort it correctly.
// But we want to distinguish a j-1 with j-2 of type S from type L.
// We can process the former but want to leave the latter for the caller.
// We record the difference by negating j-1 if it is preceded by type L.
// Either way, the insertion (into the text[j-1] bucket) is guaranteed to
// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have
// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,
// and so on, in sorted but not necessarily adjacent order, until it finds
// one preceded by an index of type L, at which point it must stop.
// That index (preceded by one of type L) is an LMS-substring start.
//
// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,
// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,
// so that the loop finishes with the top of sa containing exactly
// the LMS-substring start indexes, sorted by LMS-substring.
// Cache recently used bucket index:
:= int64(0)
:= []
:= len()
for := len() - 1; >= 0; -- {
:= int([])
if == 0 {
// Skip empty entry.
continue
}
[] = 0
if < 0 {
// Leave discovered LMS-substring start index for caller.
--
[] = int64(-)
continue
}
// Index j was on work queue, meaning k := j-1 is S-type,
// so we can now place k correctly into sa.
// If k-1 is S-type, queue k for processing later in this loop.
// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.
:= - 1
:= []
:= [-1]
if > {
= -
}
if != {
[] =
=
= []
}
--
[] = int64()
}
}
func length_8_64( []byte, []int64, int) {
:= 0 // index of current LMS-substring end (0 indicates final LMS-substring)
// The encoding of N text bytes into a “length” word
// adds 1 to each byte, packs them into the bottom
// N*8 bits of a word, and then bitwise inverts the result.
// That is, the text sequence A B C (hex 41 42 43)
// encodes as ^uint64(0x42_43_44).
// LMS-substrings can never start or end with 0xFF.
// Adding 1 ensures the encoded byte sequence never
// starts or ends with 0x00, so that present bytes can be
// distinguished from zero-padding in the top bits,
// so the length need not be separately encoded.
// Inverting the bytes increases the chance that a
// 4-byte encoding will still be ≥ len(text).
// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)
// then the high bit of the inversion will be set,
// making it clearly not a valid length (it would be a negative one).
//
// cx holds the pre-inverted encoding (the packed incremented bytes).
:= uint64(0) // byte-only
// This stanza (until the blank line) is the "LMS-substring iterator",
// described in placeLMS_8_64 above, with one line added to maintain cx.
, , := byte(0), byte(0), false
for := len() - 1; >= 0; -- {
, = [],
= <<8 | uint64(+1) // byte-only
if < {
= true
} else if > && {
= false
// Index j = i+1 is the start of an LMS-substring.
// Compute length or encoded text to store in sa[j/2].
:= + 1
var int64
if == 0 {
= 0
} else {
= int64( - )
if <= 64/8 && ^ >= uint64(len()) { // byte-only
= int64(^) // byte-only
} // byte-only
}
[>>1] =
= + 1
= uint64( + 1) // byte-only
}
}
}
func length_32( []int32, []int32, int) {
:= 0 // index of current LMS-substring end (0 indicates final LMS-substring)
// The encoding of N text int32s into a “length” word
// adds 1 to each int32, packs them into the bottom
// N*8 bits of a word, and then bitwise inverts the result.
// That is, the text sequence A B C (hex 41 42 43)
// encodes as ^uint32(0x42_43_44).
// LMS-substrings can never start or end with 0xFF.
// Adding 1 ensures the encoded int32 sequence never
// starts or ends with 0x00, so that present int32s can be
// distinguished from zero-padding in the top bits,
// so the length need not be separately encoded.
// Inverting the int32s increases the chance that a
// 4-int32 encoding will still be ≥ len(text).
// In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F)
// then the high bit of the inversion will be set,
// making it clearly not a valid length (it would be a negative one).
//
// cx holds the pre-inverted encoding (the packed incremented int32s).
// This stanza (until the blank line) is the "LMS-substring iterator",
// described in placeLMS_32 above, with one line added to maintain cx.
, , := int32(0), int32(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Index j = i+1 is the start of an LMS-substring.
// Compute length or encoded text to store in sa[j/2].
:= + 1
var int32
if == 0 {
= 0
} else {
= int32( - )
}
[>>1] =
= + 1
}
}
}
func length_64( []int64, []int64, int) {
:= 0 // index of current LMS-substring end (0 indicates final LMS-substring)
// The encoding of N text int64s into a “length” word
// adds 1 to each int64, packs them into the bottom
// N*8 bits of a word, and then bitwise inverts the result.
// That is, the text sequence A B C (hex 41 42 43)
// encodes as ^uint64(0x42_43_44).
// LMS-substrings can never start or end with 0xFF.
// Adding 1 ensures the encoded int64 sequence never
// starts or ends with 0x00, so that present int64s can be
// distinguished from zero-padding in the top bits,
// so the length need not be separately encoded.
// Inverting the int64s increases the chance that a
// 4-int64 encoding will still be ≥ len(text).
// In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F)
// then the high bit of the inversion will be set,
// making it clearly not a valid length (it would be a negative one).
//
// cx holds the pre-inverted encoding (the packed incremented int64s).
// This stanza (until the blank line) is the "LMS-substring iterator",
// described in placeLMS_64 above, with one line added to maintain cx.
, , := int64(0), int64(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Index j = i+1 is the start of an LMS-substring.
// Compute length or encoded text to store in sa[j/2].
:= + 1
var int64
if == 0 {
= 0
} else {
= int64( - )
}
[>>1] =
= + 1
}
}
}
func assignID_8_64( []byte, []int64, int) int {
:= 0
:= int64(-1) // impossible
:= int64(0)
for , := range [len()-:] {
// Is the LMS-substring at index j new, or is it the same as the last one we saw?
:= [/2]
if != {
goto
}
if uint64() >= uint64(len()) {
// “Length” is really encoded full text, and they match.
goto
}
{
// Compare actual texts.
:= int()
:= [:][:]
:= [:][:]
for := 0; < ; ++ {
if [] != [] {
goto
}
}
goto
}
:
++
=
=
:
[/2] = int64()
}
return
}
func assignID_32( []int32, []int32, int) int {
:= 0
:= int32(-1) // impossible
:= int32(0)
for , := range [len()-:] {
// Is the LMS-substring at index j new, or is it the same as the last one we saw?
:= [/2]
if != {
goto
}
if uint32() >= uint32(len()) {
// “Length” is really encoded full text, and they match.
goto
}
{
// Compare actual texts.
:= int()
:= [:][:]
:= [:][:]
for := 0; < ; ++ {
if [] != [] {
goto
}
}
goto
}
:
++
=
=
:
[/2] = int32()
}
return
}
func assignID_64( []int64, []int64, int) int {
:= 0
:= int64(-1) // impossible
:= int64(0)
for , := range [len()-:] {
// Is the LMS-substring at index j new, or is it the same as the last one we saw?
:= [/2]
if != {
goto
}
if uint64() >= uint64(len()) {
// “Length” is really encoded full text, and they match.
goto
}
{
// Compare actual texts.
:= int()
:= [:][:]
:= [:][:]
for := 0; < ; ++ {
if [] != [] {
goto
}
}
goto
}
:
++
=
=
:
[/2] = int64()
}
return
}
func map_64( []int64, int) {
:= len()
for := len() / 2; >= 0; -- {
:= []
if > 0 {
--
[] = - 1
}
}
}
func recurse_64(, []int64, , int) {
, , := [:], [:len()-], [len()-:]
// Set up temporary space for recursive call.
// We must pass sais_64 a tmp buffer with at least maxID entries.
//
// The subproblem is guaranteed to have length at most len(sa)/2,
// so that sa can hold both the subproblem and its suffix array.
// Nearly all the time, however, the subproblem has length < len(sa)/3,
// in which case there is a subproblem-sized middle of sa that
// we can reuse for temporary space (saTmp).
// When recurse_64 is called from sais_8_64, oldTmp is length 512
// (from text_64), and saTmp will typically be much larger, so we'll use saTmp.
// When deeper recursions come back to recurse_64, now oldTmp is
// the saTmp from the top-most recursion, it is typically larger than
// the current saTmp (because the current sa gets smaller and smaller
// as the recursion gets deeper), and we keep reusing that top-most
// large saTmp instead of the offered smaller ones.
//
// Why is the subproblem length so often just under len(sa)/3?
// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.
// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern
// in the input, perfect alternation of larger and smaller input bytes.
// Real text doesn't do that. If each L-type index is randomly followed
// by either an L-type or S-type index, then half the substrings will
// be of the form SLS, but the other half will be longer. Of that half,
// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.
// Not counting the final S in each (which overlaps the first S in the next),
// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.
// The space we need is further reduced by the fact that many of the
// short patterns like SLS will often be the same character sequences
// repeated throughout the text, reducing maxID relative to numLMS.
//
// For short inputs, the averages may not run in our favor, but then we
// can often fall back to using the length-512 tmp available in the
// top-most call. (Also a short allocation would not be a big deal.)
//
// For pathological inputs, we fall back to allocating a new tmp of length
// max(maxID, numLMS/2). This level of the recursion needs maxID,
// and all deeper levels of the recursion will need no more than numLMS/2,
// so this one allocation is guaranteed to suffice for the entire stack
// of recursive calls.
:=
if len() < len() {
=
}
if len() < {
// TestSAIS/forcealloc reaches this code.
:=
if < /2 {
= / 2
}
= make([]int64, )
}
// sais_64 requires that the caller arrange to clear dst,
// because in general the caller may know dst is
// freshly-allocated and already cleared. But this one is not.
clear()
sais_64(, , , )
}
func unmap_8_64( []byte, []int64, int) {
:= [len()-:]
:= len()
// "LMS-substring iterator" (see placeLMS_8_64 above).
, , := byte(0), byte(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Populate inverse map.
--
[] = int64( + 1)
}
}
// Apply inverse map to subproblem suffix array.
= [:]
for := 0; < len(); ++ {
[] = [[]]
}
}
func unmap_32( []int32, []int32, int) {
:= [len()-:]
:= len()
// "LMS-substring iterator" (see placeLMS_32 above).
, , := int32(0), int32(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Populate inverse map.
--
[] = int32( + 1)
}
}
// Apply inverse map to subproblem suffix array.
= [:]
for := 0; < len(); ++ {
[] = [[]]
}
}
func unmap_64( []int64, []int64, int) {
:= [len()-:]
:= len()
// "LMS-substring iterator" (see placeLMS_64 above).
, , := int64(0), int64(0), false
for := len() - 1; >= 0; -- {
, = [],
if < {
= true
} else if > && {
= false
// Populate inverse map.
--
[] = int64( + 1)
}
}
// Apply inverse map to subproblem suffix array.
= [:]
for := 0; < len(); ++ {
[] = [[]]
}
}
func expand_8_64( []byte, , , []int64, int) {
bucketMax_8_64(, , )
= [:256] // eliminate bound check for bucket[c] below
// Loop backward through sa, always tracking
// the next index to populate from sa[:numLMS].
// When we get to one, populate it.
// Zero the rest of the slots; they have dead values in them.
:= - 1
:= []
:= []
:= [] - 1
[] =
for := len() - 1; >= 0; -- {
if != int() {
[] = 0
continue
}
[] =
// Load next entry to put down (if any).
if > 0 {
--
= [] // TODO bounds check
= []
= [] - 1
[] =
}
}
}
func expand_32( []int32, , , []int32, int) {
bucketMax_32(, , )
// Loop backward through sa, always tracking
// the next index to populate from sa[:numLMS].
// When we get to one, populate it.
// Zero the rest of the slots; they have dead values in them.
:= - 1
:= []
:= []
:= [] - 1
[] =
for := len() - 1; >= 0; -- {
if != int() {
[] = 0
continue
}
[] =
// Load next entry to put down (if any).
if > 0 {
--
= [] // TODO bounds check
= []
= [] - 1
[] =
}
}
}
func expand_64( []int64, , , []int64, int) {
bucketMax_64(, , )
// Loop backward through sa, always tracking
// the next index to populate from sa[:numLMS].
// When we get to one, populate it.
// Zero the rest of the slots; they have dead values in them.
:= - 1
:= []
:= []
:= [] - 1
[] =
for := len() - 1; >= 0; -- {
if != int() {
[] = 0
continue
}
[] =
// Load next entry to put down (if any).
if > 0 {
--
= [] // TODO bounds check
= []
= [] - 1
[] =
}
}
}
func induceL_8_64( []byte, , , []int64) {
// Initialize positions for left side of character buckets.
bucketMin_8_64(, , )
= [:256] // eliminate bounds check for bucket[cB] below
// This scan is similar to the one in induceSubL_8_64 above.
// That one arranges to clear all but the leftmost L-type indexes.
// This scan leaves all the L-type indexes and the original S-type
// indexes, but it negates the positive leftmost L-type indexes
// (the ones that induceS_8_64 needs to process).
// expand_8_64 left out the implicit entry sa[-1] == len(text),
// corresponding to the identified type-L index len(text)-1.
// Process it before the left-to-right scan of sa proper.
// See body in loop for commentary.
:= len() - 1
, := [-1], []
if < {
= -
}
// Cache recently used bucket index.
:=
:= []
[] = int64()
++
for := 0; < len(); ++ {
:= int([])
if <= 0 {
// Skip empty or negated entry (including negated zero).
continue
}
// Index j was on work queue, meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is L-type, queue k for processing later in this loop.
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
// If k is zero, k-1 doesn't exist, so we only need to leave it
// for the caller. The caller can't tell the difference between
// an empty slot and a non-empty zero, but there's no need
// to distinguish them anyway: the final suffix array will end up
// with one zero somewhere, and that will be a real zero.
:= - 1
:= []
if > 0 {
if := [-1]; < {
= -
}
}
if != {
[] =
=
= []
}
[] = int64()
++
}
}
func induceL_32( []int32, , , []int32) {
// Initialize positions for left side of character buckets.
bucketMin_32(, , )
// This scan is similar to the one in induceSubL_32 above.
// That one arranges to clear all but the leftmost L-type indexes.
// This scan leaves all the L-type indexes and the original S-type
// indexes, but it negates the positive leftmost L-type indexes
// (the ones that induceS_32 needs to process).
// expand_32 left out the implicit entry sa[-1] == len(text),
// corresponding to the identified type-L index len(text)-1.
// Process it before the left-to-right scan of sa proper.
// See body in loop for commentary.
:= len() - 1
, := [-1], []
if < {
= -
}
// Cache recently used bucket index.
:=
:= []
[] = int32()
++
for := 0; < len(); ++ {
:= int([])
if <= 0 {
// Skip empty or negated entry (including negated zero).
continue
}
// Index j was on work queue, meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is L-type, queue k for processing later in this loop.
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
// If k is zero, k-1 doesn't exist, so we only need to leave it
// for the caller. The caller can't tell the difference between
// an empty slot and a non-empty zero, but there's no need
// to distinguish them anyway: the final suffix array will end up
// with one zero somewhere, and that will be a real zero.
:= - 1
:= []
if > 0 {
if := [-1]; < {
= -
}
}
if != {
[] =
=
= []
}
[] = int32()
++
}
}
func induceL_64( []int64, , , []int64) {
// Initialize positions for left side of character buckets.
bucketMin_64(, , )
// This scan is similar to the one in induceSubL_64 above.
// That one arranges to clear all but the leftmost L-type indexes.
// This scan leaves all the L-type indexes and the original S-type
// indexes, but it negates the positive leftmost L-type indexes
// (the ones that induceS_64 needs to process).
// expand_64 left out the implicit entry sa[-1] == len(text),
// corresponding to the identified type-L index len(text)-1.
// Process it before the left-to-right scan of sa proper.
// See body in loop for commentary.
:= len() - 1
, := [-1], []
if < {
= -
}
// Cache recently used bucket index.
:=
:= []
[] = int64()
++
for := 0; < len(); ++ {
:= int([])
if <= 0 {
// Skip empty or negated entry (including negated zero).
continue
}
// Index j was on work queue, meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is L-type, queue k for processing later in this loop.
// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.
// If k is zero, k-1 doesn't exist, so we only need to leave it
// for the caller. The caller can't tell the difference between
// an empty slot and a non-empty zero, but there's no need
// to distinguish them anyway: the final suffix array will end up
// with one zero somewhere, and that will be a real zero.
:= - 1
:= []
if > 0 {
if := [-1]; < {
= -
}
}
if != {
[] =
=
= []
}
[] = int64()
++
}
}
func induceS_8_64( []byte, , , []int64) {
// Initialize positions for right side of character buckets.
bucketMax_8_64(, , )
= [:256] // eliminate bounds check for bucket[cB] below
:= byte(0)
:= []
for := len() - 1; >= 0; -- {
:= int([])
if >= 0 {
// Skip non-flagged entry.
// (This loop can't see an empty entry; 0 means the real zero index.)
continue
}
// Negative j is a work queue entry; rewrite to positive j for final suffix array.
= -
[] = int64()
// Index j was on work queue (encoded as -j but now decoded),
// meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is S-type, queue -k for processing later in this loop.
// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
// If k is zero, k-1 doesn't exist, so we only need to leave it
// for the caller.
:= - 1
:= []
if > 0 {
if := [-1]; <= {
= -
}
}
if != {
[] =
=
= []
}
--
[] = int64()
}
}
func induceS_32( []int32, , , []int32) {
// Initialize positions for right side of character buckets.
bucketMax_32(, , )
:= int32(0)
:= []
for := len() - 1; >= 0; -- {
:= int([])
if >= 0 {
// Skip non-flagged entry.
// (This loop can't see an empty entry; 0 means the real zero index.)
continue
}
// Negative j is a work queue entry; rewrite to positive j for final suffix array.
= -
[] = int32()
// Index j was on work queue (encoded as -j but now decoded),
// meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is S-type, queue -k for processing later in this loop.
// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
// If k is zero, k-1 doesn't exist, so we only need to leave it
// for the caller.
:= - 1
:= []
if > 0 {
if := [-1]; <= {
= -
}
}
if != {
[] =
=
= []
}
--
[] = int32()
}
}
func induceS_64( []int64, , , []int64) {
// Initialize positions for right side of character buckets.
bucketMax_64(, , )
:= int64(0)
:= []
for := len() - 1; >= 0; -- {
:= int([])
if >= 0 {
// Skip non-flagged entry.
// (This loop can't see an empty entry; 0 means the real zero index.)
continue
}
// Negative j is a work queue entry; rewrite to positive j for final suffix array.
= -
[] = int64()
// Index j was on work queue (encoded as -j but now decoded),
// meaning k := j-1 is L-type,
// so we can now place k correctly into sa.
// If k-1 is S-type, queue -k for processing later in this loop.
// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.
// If k is zero, k-1 doesn't exist, so we only need to leave it
// for the caller.
:= - 1
:= []
if > 0 {
if := [-1]; <= {
= -
}
}
if != {
[] =
=
= []
}
--
[] = int64()
}
}
The pages are generated with Golds v0.7.0-preview. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds. |