Source File
sais2.go
Belonging Package
index/suffixarray
// Copyright 2019 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.// Code generated by go generate; DO NOT EDIT.package suffixarrayfunc text_64( []byte, []int64) {if int(int64(len())) != len() || len() != len() {panic("suffixarray: misuse of text_64")}sais_8_64(, 256, , make([]int64, 2*256))}func sais_8_64( []byte, int, , []int64) {if len() != len() || len() < {panic("suffixarray: misuse of sais_8_64")}// Trivial base cases. Sorting 0 or 1 things is easy.if len() == 0 {return}if len() == 1 {[0] = 0return}// Establish slices indexed by text character// holding character frequency and bucket-sort offsets.// If there's only enough tmp for one slice,// we make it the bucket offsets and recompute// the character frequency each time we need it.var , []int64if len() >= 2* {, = [:], [:2*][0] = -1 // mark as uninitialized} else {, = nil, [:]}// The SAIS algorithm.// Each of these calls makes one scan through sa.// See the individual functions for documentation// about each's role in the algorithm.:= placeLMS_8_64(, , , )if <= 1 {// 0 or 1 items are already sorted. Do nothing.} else {induceSubL_8_64(, , , )induceSubS_8_64(, , , )length_8_64(, , ):= assignID_8_64(, , )if < {map_64(, )recurse_64(, , , )unmap_8_64(, , )} else {// If maxID == numLMS, then each LMS-substring// is unique, so the relative ordering of two LMS-suffixes// is determined by just the leading LMS-substring.// That is, the LMS-suffix sort order matches the// (simpler) LMS-substring sort order.// Copy the original LMS-substring order into the// suffix array destination.copy(, [len()-:])}expand_8_64(, , , , )}induceL_8_64(, , , )induceS_8_64(, , , )// Mark for caller that we overwrote tmp.[0] = -1}func sais_32( []int32, int, , []int32) {if len() != len() || len() < {panic("suffixarray: misuse of sais_32")}// Trivial base cases. Sorting 0 or 1 things is easy.if len() == 0 {return}if len() == 1 {[0] = 0return}// Establish slices indexed by text character// holding character frequency and bucket-sort offsets.// If there's only enough tmp for one slice,// we make it the bucket offsets and recompute// the character frequency each time we need it.var , []int32if len() >= 2* {, = [:], [:2*][0] = -1 // mark as uninitialized} else {, = nil, [:]}// The SAIS algorithm.// Each of these calls makes one scan through sa.// See the individual functions for documentation// about each's role in the algorithm.:= placeLMS_32(, , , )if <= 1 {// 0 or 1 items are already sorted. Do nothing.} else {induceSubL_32(, , , )induceSubS_32(, , , )length_32(, , ):= assignID_32(, , )if < {map_32(, )recurse_32(, , , )unmap_32(, , )} else {// If maxID == numLMS, then each LMS-substring// is unique, so the relative ordering of two LMS-suffixes// is determined by just the leading LMS-substring.// That is, the LMS-suffix sort order matches the// (simpler) LMS-substring sort order.// Copy the original LMS-substring order into the// suffix array destination.copy(, [len()-:])}expand_32(, , , , )}induceL_32(, , , )induceS_32(, , , )// Mark for caller that we overwrote tmp.[0] = -1}func sais_64( []int64, int, , []int64) {if len() != len() || len() < {panic("suffixarray: misuse of sais_64")}// Trivial base cases. Sorting 0 or 1 things is easy.if len() == 0 {return}if len() == 1 {[0] = 0return}// Establish slices indexed by text character// holding character frequency and bucket-sort offsets.// If there's only enough tmp for one slice,// we make it the bucket offsets and recompute// the character frequency each time we need it.var , []int64if len() >= 2* {, = [:], [:2*][0] = -1 // mark as uninitialized} else {, = nil, [:]}// The SAIS algorithm.// Each of these calls makes one scan through sa.// See the individual functions for documentation// about each's role in the algorithm.:= placeLMS_64(, , , )if <= 1 {// 0 or 1 items are already sorted. Do nothing.} else {induceSubL_64(, , , )induceSubS_64(, , , )length_64(, , ):= assignID_64(, , )if < {map_64(, )recurse_64(, , , )unmap_64(, , )} else {// If maxID == numLMS, then each LMS-substring// is unique, so the relative ordering of two LMS-suffixes// is determined by just the leading LMS-substring.// That is, the LMS-suffix sort order matches the// (simpler) LMS-substring sort order.// Copy the original LMS-substring order into the// suffix array destination.copy(, [len()-:])}expand_64(, , , , )}induceL_64(, , , )induceS_64(, , , )// Mark for caller that we overwrote tmp.[0] = -1}func freq_8_64( []byte, , []int64) []int64 {if != nil && [0] >= 0 {return // already computed}if == nil {=}= [:256] // eliminate bounds check for freq[c] belowclear()for , := range {[]++}return}func freq_32( []int32, , []int32) []int32 {if != nil && [0] >= 0 {return // already computed}if == nil {=}clear()for , := range {[]++}return}func freq_64( []int64, , []int64) []int64 {if != nil && [0] >= 0 {return // already computed}if == nil {=}clear()for , := range {[]++}return}func bucketMin_8_64( []byte, , []int64) {= freq_8_64(, , )= [:256] // establish len(freq) = 256, so 0 ≤ i < 256 below= [:256] // eliminate bounds check for bucket[i] below:= int64(0)for , := range {[] =+=}}func bucketMin_32( []int32, , []int32) {= freq_32(, , ):= int32(0)for , := range {[] =+=}}func bucketMin_64( []int64, , []int64) {= freq_64(, , ):= int64(0)for , := range {[] =+=}}func bucketMax_8_64( []byte, , []int64) {= freq_8_64(, , )= [:256] // establish len(freq) = 256, so 0 ≤ i < 256 below= [:256] // eliminate bounds check for bucket[i] below:= int64(0)for , := range {+=[] =}}func bucketMax_32( []int32, , []int32) {= freq_32(, , ):= int32(0)for , := range {+=[] =}}func bucketMax_64( []int64, , []int64) {= freq_64(, , ):= int64(0)for , := range {+=[] =}}func placeLMS_8_64( []byte, , , []int64) int {bucketMax_8_64(, , ):= 0:= int64(-1)= [:256] // eliminate bounds check for bucket[c1] below// The next stanza of code (until the blank line) loop backward// over text, stopping to execute a code body at each position i// such that text[i] is an L-character and text[i+1] is an S-character.// That is, i+1 is the position of the start of an LMS-substring.// These could be hoisted out into a function with a callback,// but at a significant speed cost. Instead, we just write these// seven lines a few times in this source file. The copies below// refer back to the pattern established by this original as the// "LMS-substring iterator".//// In every scan through the text, c0, c1 are successive characters of text.// In this backward scan, c0 == text[i] and c1 == text[i+1].// By scanning backward, we can keep track of whether the current// position is type-S or type-L according to the usual definition://// - position len(text) is type S with text[len(text)] == -1 (the sentinel)// - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.// - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.//// The backward scan lets us maintain the current type,// update it when we see c0 != c1, and otherwise leave it alone.// We want to identify all S positions with a preceding L.// Position len(text) is one such position by definition, but we have// nowhere to write it down, so we eliminate it by untruthfully// setting isTypeS = false at the start of the loop., , := byte(0), byte(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Bucket the index i+1 for the start of an LMS-substring.:= [] - 1[] =[] = int64( + 1)=++}}// We recorded the LMS-substring starts but really want the ends.// Luckily, with two differences, the start indexes and the end indexes are the same.// The first difference is that the rightmost LMS-substring's end index is len(text),// so the caller must pretend that sa[-1] == len(text), as noted above.// The second difference is that the first leftmost LMS-substring start index// does not end an earlier LMS-substring, so as an optimization we can omit// that leftmost LMS-substring start index (the last one we wrote).//// Exception: if numLMS <= 1, the caller is not going to bother with// the recursion at all and will treat the result as containing LMS-substring starts.// In that case, we don't remove the final entry.if > 1 {[] = 0}return}func placeLMS_32( []int32, , , []int32) int {bucketMax_32(, , ):= 0:= int32(-1)// The next stanza of code (until the blank line) loop backward// over text, stopping to execute a code body at each position i// such that text[i] is an L-character and text[i+1] is an S-character.// That is, i+1 is the position of the start of an LMS-substring.// These could be hoisted out into a function with a callback,// but at a significant speed cost. Instead, we just write these// seven lines a few times in this source file. The copies below// refer back to the pattern established by this original as the// "LMS-substring iterator".//// In every scan through the text, c0, c1 are successive characters of text.// In this backward scan, c0 == text[i] and c1 == text[i+1].// By scanning backward, we can keep track of whether the current// position is type-S or type-L according to the usual definition://// - position len(text) is type S with text[len(text)] == -1 (the sentinel)// - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.// - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.//// The backward scan lets us maintain the current type,// update it when we see c0 != c1, and otherwise leave it alone.// We want to identify all S positions with a preceding L.// Position len(text) is one such position by definition, but we have// nowhere to write it down, so we eliminate it by untruthfully// setting isTypeS = false at the start of the loop., , := int32(0), int32(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Bucket the index i+1 for the start of an LMS-substring.:= [] - 1[] =[] = int32( + 1)=++}}// We recorded the LMS-substring starts but really want the ends.// Luckily, with two differences, the start indexes and the end indexes are the same.// The first difference is that the rightmost LMS-substring's end index is len(text),// so the caller must pretend that sa[-1] == len(text), as noted above.// The second difference is that the first leftmost LMS-substring start index// does not end an earlier LMS-substring, so as an optimization we can omit// that leftmost LMS-substring start index (the last one we wrote).//// Exception: if numLMS <= 1, the caller is not going to bother with// the recursion at all and will treat the result as containing LMS-substring starts.// In that case, we don't remove the final entry.if > 1 {[] = 0}return}func placeLMS_64( []int64, , , []int64) int {bucketMax_64(, , ):= 0:= int64(-1)// The next stanza of code (until the blank line) loop backward// over text, stopping to execute a code body at each position i// such that text[i] is an L-character and text[i+1] is an S-character.// That is, i+1 is the position of the start of an LMS-substring.// These could be hoisted out into a function with a callback,// but at a significant speed cost. Instead, we just write these// seven lines a few times in this source file. The copies below// refer back to the pattern established by this original as the// "LMS-substring iterator".//// In every scan through the text, c0, c1 are successive characters of text.// In this backward scan, c0 == text[i] and c1 == text[i+1].// By scanning backward, we can keep track of whether the current// position is type-S or type-L according to the usual definition://// - position len(text) is type S with text[len(text)] == -1 (the sentinel)// - position i is type S if text[i] < text[i+1], or if text[i] == text[i+1] && i+1 is type S.// - position i is type L if text[i] > text[i+1], or if text[i] == text[i+1] && i+1 is type L.//// The backward scan lets us maintain the current type,// update it when we see c0 != c1, and otherwise leave it alone.// We want to identify all S positions with a preceding L.// Position len(text) is one such position by definition, but we have// nowhere to write it down, so we eliminate it by untruthfully// setting isTypeS = false at the start of the loop., , := int64(0), int64(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Bucket the index i+1 for the start of an LMS-substring.:= [] - 1[] =[] = int64( + 1)=++}}// We recorded the LMS-substring starts but really want the ends.// Luckily, with two differences, the start indexes and the end indexes are the same.// The first difference is that the rightmost LMS-substring's end index is len(text),// so the caller must pretend that sa[-1] == len(text), as noted above.// The second difference is that the first leftmost LMS-substring start index// does not end an earlier LMS-substring, so as an optimization we can omit// that leftmost LMS-substring start index (the last one we wrote).//// Exception: if numLMS <= 1, the caller is not going to bother with// the recursion at all and will treat the result as containing LMS-substring starts.// In that case, we don't remove the final entry.if > 1 {[] = 0}return}func induceSubL_8_64( []byte, , , []int64) {// Initialize positions for left side of character buckets.bucketMin_8_64(, , )= [:256] // eliminate bounds check for bucket[cB] below// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.// Because j-1 is type L, inserting it into sa now will sort it correctly.// But we want to distinguish a j-1 with j-2 of type L from type S.// We can process the former but want to leave the latter for the caller.// We record the difference by negating j-1 if it is preceded by type S.// Either way, the insertion (into the text[j-1] bucket) is guaranteed to// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,// and so on, in sorted but not necessarily adjacent order, until it finds// one preceded by an index of type S, at which point it must stop.//// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing// only the indexes of the leftmost L-type indexes for each LMS-substring.//// The suffix array sa therefore serves simultaneously as input, output,// and a miraculously well-tailored work queue.// placeLMS_8_64 left out the implicit entry sa[-1] == len(text),// corresponding to the identified type-L index len(text)-1.// Process it before the left-to-right scan of sa proper.// See body in loop for commentary.:= len() - 1, := [-1], []if < {= -}// Cache recently used bucket index:// we're processing suffixes in sorted order// and accessing buckets indexed by the// byte before the sorted order, which still// has very good locality.// Invariant: b is cached, possibly dirty copy of bucket[cB].:=:= [][] = int64()++for := 0; < len(); ++ {:= int([])if == 0 {// Skip empty entry.continue}if < 0 {// Leave discovered type-S index for caller.[] = int64(-)continue}[] = 0// Index j was on work queue, meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is L-type, queue k for processing later in this loop.// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.:= - 1, := [-1], []if < {= -}if != {[] === []}[] = int64()++}}func induceSubL_32( []int32, , , []int32) {// Initialize positions for left side of character buckets.bucketMin_32(, , )// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.// Because j-1 is type L, inserting it into sa now will sort it correctly.// But we want to distinguish a j-1 with j-2 of type L from type S.// We can process the former but want to leave the latter for the caller.// We record the difference by negating j-1 if it is preceded by type S.// Either way, the insertion (into the text[j-1] bucket) is guaranteed to// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,// and so on, in sorted but not necessarily adjacent order, until it finds// one preceded by an index of type S, at which point it must stop.//// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing// only the indexes of the leftmost L-type indexes for each LMS-substring.//// The suffix array sa therefore serves simultaneously as input, output,// and a miraculously well-tailored work queue.// placeLMS_32 left out the implicit entry sa[-1] == len(text),// corresponding to the identified type-L index len(text)-1.// Process it before the left-to-right scan of sa proper.// See body in loop for commentary.:= len() - 1, := [-1], []if < {= -}// Cache recently used bucket index:// we're processing suffixes in sorted order// and accessing buckets indexed by the// int32 before the sorted order, which still// has very good locality.// Invariant: b is cached, possibly dirty copy of bucket[cB].:=:= [][] = int32()++for := 0; < len(); ++ {:= int([])if == 0 {// Skip empty entry.continue}if < 0 {// Leave discovered type-S index for caller.[] = int32(-)continue}[] = 0// Index j was on work queue, meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is L-type, queue k for processing later in this loop.// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.:= - 1, := [-1], []if < {= -}if != {[] === []}[] = int32()++}}func induceSubL_64( []int64, , , []int64) {// Initialize positions for left side of character buckets.bucketMin_64(, , )// As we scan the array left-to-right, each sa[i] = j > 0 is a correctly// sorted suffix array entry (for text[j:]) for which we know that j-1 is type L.// Because j-1 is type L, inserting it into sa now will sort it correctly.// But we want to distinguish a j-1 with j-2 of type L from type S.// We can process the former but want to leave the latter for the caller.// We record the difference by negating j-1 if it is preceded by type S.// Either way, the insertion (into the text[j-1] bucket) is guaranteed to// happen at sa[i´] for some i´ > i, that is, in the portion of sa we have// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,// and so on, in sorted but not necessarily adjacent order, until it finds// one preceded by an index of type S, at which point it must stop.//// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,// and we flip sa[i] < 0 to -sa[i], so that the loop finishes with sa containing// only the indexes of the leftmost L-type indexes for each LMS-substring.//// The suffix array sa therefore serves simultaneously as input, output,// and a miraculously well-tailored work queue.// placeLMS_64 left out the implicit entry sa[-1] == len(text),// corresponding to the identified type-L index len(text)-1.// Process it before the left-to-right scan of sa proper.// See body in loop for commentary.:= len() - 1, := [-1], []if < {= -}// Cache recently used bucket index:// we're processing suffixes in sorted order// and accessing buckets indexed by the// int64 before the sorted order, which still// has very good locality.// Invariant: b is cached, possibly dirty copy of bucket[cB].:=:= [][] = int64()++for := 0; < len(); ++ {:= int([])if == 0 {// Skip empty entry.continue}if < 0 {// Leave discovered type-S index for caller.[] = int64(-)continue}[] = 0// Index j was on work queue, meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is L-type, queue k for processing later in this loop.// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.:= - 1, := [-1], []if < {= -}if != {[] === []}[] = int64()++}}func induceSubS_8_64( []byte, , , []int64) {// Initialize positions for right side of character buckets.bucketMax_8_64(, , )= [:256] // eliminate bounds check for bucket[cB] below// Analogous to induceSubL_8_64 above,// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.// Because j-1 is type S, inserting it into sa now will sort it correctly.// But we want to distinguish a j-1 with j-2 of type S from type L.// We can process the former but want to leave the latter for the caller.// We record the difference by negating j-1 if it is preceded by type L.// Either way, the insertion (into the text[j-1] bucket) is guaranteed to// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,// and so on, in sorted but not necessarily adjacent order, until it finds// one preceded by an index of type L, at which point it must stop.// That index (preceded by one of type L) is an LMS-substring start.//// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,// so that the loop finishes with the top of sa containing exactly// the LMS-substring start indexes, sorted by LMS-substring.// Cache recently used bucket index::= byte(0):= []:= len()for := len() - 1; >= 0; -- {:= int([])if == 0 {// Skip empty entry.continue}[] = 0if < 0 {// Leave discovered LMS-substring start index for caller.--[] = int64(-)continue}// Index j was on work queue, meaning k := j-1 is S-type,// so we can now place k correctly into sa.// If k-1 is S-type, queue k for processing later in this loop.// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.:= - 1:= []:= [-1]if > {= -}if != {[] === []}--[] = int64()}}func induceSubS_32( []int32, , , []int32) {// Initialize positions for right side of character buckets.bucketMax_32(, , )// Analogous to induceSubL_32 above,// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.// Because j-1 is type S, inserting it into sa now will sort it correctly.// But we want to distinguish a j-1 with j-2 of type S from type L.// We can process the former but want to leave the latter for the caller.// We record the difference by negating j-1 if it is preceded by type L.// Either way, the insertion (into the text[j-1] bucket) is guaranteed to// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,// and so on, in sorted but not necessarily adjacent order, until it finds// one preceded by an index of type L, at which point it must stop.// That index (preceded by one of type L) is an LMS-substring start.//// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,// so that the loop finishes with the top of sa containing exactly// the LMS-substring start indexes, sorted by LMS-substring.// Cache recently used bucket index::= int32(0):= []:= len()for := len() - 1; >= 0; -- {:= int([])if == 0 {// Skip empty entry.continue}[] = 0if < 0 {// Leave discovered LMS-substring start index for caller.--[] = int32(-)continue}// Index j was on work queue, meaning k := j-1 is S-type,// so we can now place k correctly into sa.// If k-1 is S-type, queue k for processing later in this loop.// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.:= - 1:= []:= [-1]if > {= -}if != {[] === []}--[] = int32()}}func induceSubS_64( []int64, , , []int64) {// Initialize positions for right side of character buckets.bucketMax_64(, , )// Analogous to induceSubL_64 above,// as we scan the array right-to-left, each sa[i] = j > 0 is a correctly// sorted suffix array entry (for text[j:]) for which we know that j-1 is type S.// Because j-1 is type S, inserting it into sa now will sort it correctly.// But we want to distinguish a j-1 with j-2 of type S from type L.// We can process the former but want to leave the latter for the caller.// We record the difference by negating j-1 if it is preceded by type L.// Either way, the insertion (into the text[j-1] bucket) is guaranteed to// happen at sa[i´] for some i´ < i, that is, in the portion of sa we have// yet to scan. A single pass therefore sees indexes j, j-1, j-2, j-3,// and so on, in sorted but not necessarily adjacent order, until it finds// one preceded by an index of type L, at which point it must stop.// That index (preceded by one of type L) is an LMS-substring start.//// As we scan through the array, we clear the worked entries (sa[i] > 0) to zero,// and we flip sa[i] < 0 to -sa[i] and compact into the top of sa,// so that the loop finishes with the top of sa containing exactly// the LMS-substring start indexes, sorted by LMS-substring.// Cache recently used bucket index::= int64(0):= []:= len()for := len() - 1; >= 0; -- {:= int([])if == 0 {// Skip empty entry.continue}[] = 0if < 0 {// Leave discovered LMS-substring start index for caller.--[] = int64(-)continue}// Index j was on work queue, meaning k := j-1 is S-type,// so we can now place k correctly into sa.// If k-1 is S-type, queue k for processing later in this loop.// If k-1 is L-type (text[k-1] > text[k]), queue -k to save for the caller.:= - 1:= []:= [-1]if > {= -}if != {[] === []}--[] = int64()}}func length_8_64( []byte, []int64, int) {:= 0 // index of current LMS-substring end (0 indicates final LMS-substring)// The encoding of N text bytes into a “length” word// adds 1 to each byte, packs them into the bottom// N*8 bits of a word, and then bitwise inverts the result.// That is, the text sequence A B C (hex 41 42 43)// encodes as ^uint64(0x42_43_44).// LMS-substrings can never start or end with 0xFF.// Adding 1 ensures the encoded byte sequence never// starts or ends with 0x00, so that present bytes can be// distinguished from zero-padding in the top bits,// so the length need not be separately encoded.// Inverting the bytes increases the chance that a// 4-byte encoding will still be ≥ len(text).// In particular, if the first byte is ASCII (<= 0x7E, so +1 <= 0x7F)// then the high bit of the inversion will be set,// making it clearly not a valid length (it would be a negative one).//// cx holds the pre-inverted encoding (the packed incremented bytes).:= uint64(0) // byte-only// This stanza (until the blank line) is the "LMS-substring iterator",// described in placeLMS_8_64 above, with one line added to maintain cx., , := byte(0), byte(0), falsefor := len() - 1; >= 0; -- {, = [],= <<8 | uint64(+1) // byte-onlyif < {= true} else if > && {= false// Index j = i+1 is the start of an LMS-substring.// Compute length or encoded text to store in sa[j/2].:= + 1var int64if == 0 {= 0} else {= int64( - )if <= 64/8 && ^ >= uint64(len()) { // byte-only= int64(^) // byte-only} // byte-only}[>>1] == + 1= uint64( + 1) // byte-only}}}func length_32( []int32, []int32, int) {:= 0 // index of current LMS-substring end (0 indicates final LMS-substring)// The encoding of N text int32s into a “length” word// adds 1 to each int32, packs them into the bottom// N*8 bits of a word, and then bitwise inverts the result.// That is, the text sequence A B C (hex 41 42 43)// encodes as ^uint32(0x42_43_44).// LMS-substrings can never start or end with 0xFF.// Adding 1 ensures the encoded int32 sequence never// starts or ends with 0x00, so that present int32s can be// distinguished from zero-padding in the top bits,// so the length need not be separately encoded.// Inverting the int32s increases the chance that a// 4-int32 encoding will still be ≥ len(text).// In particular, if the first int32 is ASCII (<= 0x7E, so +1 <= 0x7F)// then the high bit of the inversion will be set,// making it clearly not a valid length (it would be a negative one).//// cx holds the pre-inverted encoding (the packed incremented int32s).// This stanza (until the blank line) is the "LMS-substring iterator",// described in placeLMS_32 above, with one line added to maintain cx., , := int32(0), int32(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Index j = i+1 is the start of an LMS-substring.// Compute length or encoded text to store in sa[j/2].:= + 1var int32if == 0 {= 0} else {= int32( - )}[>>1] == + 1}}}func length_64( []int64, []int64, int) {:= 0 // index of current LMS-substring end (0 indicates final LMS-substring)// The encoding of N text int64s into a “length” word// adds 1 to each int64, packs them into the bottom// N*8 bits of a word, and then bitwise inverts the result.// That is, the text sequence A B C (hex 41 42 43)// encodes as ^uint64(0x42_43_44).// LMS-substrings can never start or end with 0xFF.// Adding 1 ensures the encoded int64 sequence never// starts or ends with 0x00, so that present int64s can be// distinguished from zero-padding in the top bits,// so the length need not be separately encoded.// Inverting the int64s increases the chance that a// 4-int64 encoding will still be ≥ len(text).// In particular, if the first int64 is ASCII (<= 0x7E, so +1 <= 0x7F)// then the high bit of the inversion will be set,// making it clearly not a valid length (it would be a negative one).//// cx holds the pre-inverted encoding (the packed incremented int64s).// This stanza (until the blank line) is the "LMS-substring iterator",// described in placeLMS_64 above, with one line added to maintain cx., , := int64(0), int64(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Index j = i+1 is the start of an LMS-substring.// Compute length or encoded text to store in sa[j/2].:= + 1var int64if == 0 {= 0} else {= int64( - )}[>>1] == + 1}}}func assignID_8_64( []byte, []int64, int) int {:= 0:= int64(-1) // impossible:= int64(0)for , := range [len()-:] {// Is the LMS-substring at index j new, or is it the same as the last one we saw?:= [/2]if != {goto}if uint64() >= uint64(len()) {// “Length” is really encoded full text, and they match.goto}{// Compare actual texts.:= int():= [:][:]:= [:][:]for := 0; < ; ++ {if [] != [] {goto}}goto}:++==:[/2] = int64()}return}func assignID_32( []int32, []int32, int) int {:= 0:= int32(-1) // impossible:= int32(0)for , := range [len()-:] {// Is the LMS-substring at index j new, or is it the same as the last one we saw?:= [/2]if != {goto}if uint32() >= uint32(len()) {// “Length” is really encoded full text, and they match.goto}{// Compare actual texts.:= int():= [:][:]:= [:][:]for := 0; < ; ++ {if [] != [] {goto}}goto}:++==:[/2] = int32()}return}func assignID_64( []int64, []int64, int) int {:= 0:= int64(-1) // impossible:= int64(0)for , := range [len()-:] {// Is the LMS-substring at index j new, or is it the same as the last one we saw?:= [/2]if != {goto}if uint64() >= uint64(len()) {// “Length” is really encoded full text, and they match.goto}{// Compare actual texts.:= int():= [:][:]:= [:][:]for := 0; < ; ++ {if [] != [] {goto}}goto}:++==:[/2] = int64()}return}func map_64( []int64, int) {:= len()for := len() / 2; >= 0; -- {:= []if > 0 {--[] = - 1}}}func recurse_64(, []int64, , int) {, , := [:], [:len()-], [len()-:]// Set up temporary space for recursive call.// We must pass sais_64 a tmp buffer with at least maxID entries.//// The subproblem is guaranteed to have length at most len(sa)/2,// so that sa can hold both the subproblem and its suffix array.// Nearly all the time, however, the subproblem has length < len(sa)/3,// in which case there is a subproblem-sized middle of sa that// we can reuse for temporary space (saTmp).// When recurse_64 is called from sais_8_64, oldTmp is length 512// (from text_64), and saTmp will typically be much larger, so we'll use saTmp.// When deeper recursions come back to recurse_64, now oldTmp is// the saTmp from the top-most recursion, it is typically larger than// the current saTmp (because the current sa gets smaller and smaller// as the recursion gets deeper), and we keep reusing that top-most// large saTmp instead of the offered smaller ones.//// Why is the subproblem length so often just under len(sa)/3?// See Nong, Zhang, and Chen, section 3.6 for a plausible explanation.// In brief, the len(sa)/2 case would correspond to an SLSLSLSLSLSL pattern// in the input, perfect alternation of larger and smaller input bytes.// Real text doesn't do that. If each L-type index is randomly followed// by either an L-type or S-type index, then half the substrings will// be of the form SLS, but the other half will be longer. Of that half,// half (a quarter overall) will be SLLS; an eighth will be SLLLS, and so on.// Not counting the final S in each (which overlaps the first S in the next),// This works out to an average length 2×½ + 3×¼ + 4×⅛ + ... = 3.// The space we need is further reduced by the fact that many of the// short patterns like SLS will often be the same character sequences// repeated throughout the text, reducing maxID relative to numLMS.//// For short inputs, the averages may not run in our favor, but then we// can often fall back to using the length-512 tmp available in the// top-most call. (Also a short allocation would not be a big deal.)//// For pathological inputs, we fall back to allocating a new tmp of length// max(maxID, numLMS/2). This level of the recursion needs maxID,// and all deeper levels of the recursion will need no more than numLMS/2,// so this one allocation is guaranteed to suffice for the entire stack// of recursive calls.:=if len() < len() {=}if len() < {// TestSAIS/forcealloc reaches this code.:=if < /2 {= / 2}= make([]int64, )}// sais_64 requires that the caller arrange to clear dst,// because in general the caller may know dst is// freshly-allocated and already cleared. But this one is not.clear()sais_64(, , , )}func unmap_8_64( []byte, []int64, int) {:= [len()-:]:= len()// "LMS-substring iterator" (see placeLMS_8_64 above)., , := byte(0), byte(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Populate inverse map.--[] = int64( + 1)}}// Apply inverse map to subproblem suffix array.= [:]for := 0; < len(); ++ {[] = [[]]}}func unmap_32( []int32, []int32, int) {:= [len()-:]:= len()// "LMS-substring iterator" (see placeLMS_32 above)., , := int32(0), int32(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Populate inverse map.--[] = int32( + 1)}}// Apply inverse map to subproblem suffix array.= [:]for := 0; < len(); ++ {[] = [[]]}}func unmap_64( []int64, []int64, int) {:= [len()-:]:= len()// "LMS-substring iterator" (see placeLMS_64 above)., , := int64(0), int64(0), falsefor := len() - 1; >= 0; -- {, = [],if < {= true} else if > && {= false// Populate inverse map.--[] = int64( + 1)}}// Apply inverse map to subproblem suffix array.= [:]for := 0; < len(); ++ {[] = [[]]}}func expand_8_64( []byte, , , []int64, int) {bucketMax_8_64(, , )= [:256] // eliminate bound check for bucket[c] below// Loop backward through sa, always tracking// the next index to populate from sa[:numLMS].// When we get to one, populate it.// Zero the rest of the slots; they have dead values in them.:= - 1:= []:= []:= [] - 1[] =for := len() - 1; >= 0; -- {if != int() {[] = 0continue}[] =// Load next entry to put down (if any).if > 0 {--= [] // TODO bounds check= []= [] - 1[] =}}}func expand_32( []int32, , , []int32, int) {bucketMax_32(, , )// Loop backward through sa, always tracking// the next index to populate from sa[:numLMS].// When we get to one, populate it.// Zero the rest of the slots; they have dead values in them.:= - 1:= []:= []:= [] - 1[] =for := len() - 1; >= 0; -- {if != int() {[] = 0continue}[] =// Load next entry to put down (if any).if > 0 {--= [] // TODO bounds check= []= [] - 1[] =}}}func expand_64( []int64, , , []int64, int) {bucketMax_64(, , )// Loop backward through sa, always tracking// the next index to populate from sa[:numLMS].// When we get to one, populate it.// Zero the rest of the slots; they have dead values in them.:= - 1:= []:= []:= [] - 1[] =for := len() - 1; >= 0; -- {if != int() {[] = 0continue}[] =// Load next entry to put down (if any).if > 0 {--= [] // TODO bounds check= []= [] - 1[] =}}}func induceL_8_64( []byte, , , []int64) {// Initialize positions for left side of character buckets.bucketMin_8_64(, , )= [:256] // eliminate bounds check for bucket[cB] below// This scan is similar to the one in induceSubL_8_64 above.// That one arranges to clear all but the leftmost L-type indexes.// This scan leaves all the L-type indexes and the original S-type// indexes, but it negates the positive leftmost L-type indexes// (the ones that induceS_8_64 needs to process).// expand_8_64 left out the implicit entry sa[-1] == len(text),// corresponding to the identified type-L index len(text)-1.// Process it before the left-to-right scan of sa proper.// See body in loop for commentary.:= len() - 1, := [-1], []if < {= -}// Cache recently used bucket index.:=:= [][] = int64()++for := 0; < len(); ++ {:= int([])if <= 0 {// Skip empty or negated entry (including negated zero).continue}// Index j was on work queue, meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is L-type, queue k for processing later in this loop.// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.// If k is zero, k-1 doesn't exist, so we only need to leave it// for the caller. The caller can't tell the difference between// an empty slot and a non-empty zero, but there's no need// to distinguish them anyway: the final suffix array will end up// with one zero somewhere, and that will be a real zero.:= - 1:= []if > 0 {if := [-1]; < {= -}}if != {[] === []}[] = int64()++}}func induceL_32( []int32, , , []int32) {// Initialize positions for left side of character buckets.bucketMin_32(, , )// This scan is similar to the one in induceSubL_32 above.// That one arranges to clear all but the leftmost L-type indexes.// This scan leaves all the L-type indexes and the original S-type// indexes, but it negates the positive leftmost L-type indexes// (the ones that induceS_32 needs to process).// expand_32 left out the implicit entry sa[-1] == len(text),// corresponding to the identified type-L index len(text)-1.// Process it before the left-to-right scan of sa proper.// See body in loop for commentary.:= len() - 1, := [-1], []if < {= -}// Cache recently used bucket index.:=:= [][] = int32()++for := 0; < len(); ++ {:= int([])if <= 0 {// Skip empty or negated entry (including negated zero).continue}// Index j was on work queue, meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is L-type, queue k for processing later in this loop.// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.// If k is zero, k-1 doesn't exist, so we only need to leave it// for the caller. The caller can't tell the difference between// an empty slot and a non-empty zero, but there's no need// to distinguish them anyway: the final suffix array will end up// with one zero somewhere, and that will be a real zero.:= - 1:= []if > 0 {if := [-1]; < {= -}}if != {[] === []}[] = int32()++}}func induceL_64( []int64, , , []int64) {// Initialize positions for left side of character buckets.bucketMin_64(, , )// This scan is similar to the one in induceSubL_64 above.// That one arranges to clear all but the leftmost L-type indexes.// This scan leaves all the L-type indexes and the original S-type// indexes, but it negates the positive leftmost L-type indexes// (the ones that induceS_64 needs to process).// expand_64 left out the implicit entry sa[-1] == len(text),// corresponding to the identified type-L index len(text)-1.// Process it before the left-to-right scan of sa proper.// See body in loop for commentary.:= len() - 1, := [-1], []if < {= -}// Cache recently used bucket index.:=:= [][] = int64()++for := 0; < len(); ++ {:= int([])if <= 0 {// Skip empty or negated entry (including negated zero).continue}// Index j was on work queue, meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is L-type, queue k for processing later in this loop.// If k-1 is S-type (text[k-1] < text[k]), queue -k to save for the caller.// If k is zero, k-1 doesn't exist, so we only need to leave it// for the caller. The caller can't tell the difference between// an empty slot and a non-empty zero, but there's no need// to distinguish them anyway: the final suffix array will end up// with one zero somewhere, and that will be a real zero.:= - 1:= []if > 0 {if := [-1]; < {= -}}if != {[] === []}[] = int64()++}}func induceS_8_64( []byte, , , []int64) {// Initialize positions for right side of character buckets.bucketMax_8_64(, , )= [:256] // eliminate bounds check for bucket[cB] below:= byte(0):= []for := len() - 1; >= 0; -- {:= int([])if >= 0 {// Skip non-flagged entry.// (This loop can't see an empty entry; 0 means the real zero index.)continue}// Negative j is a work queue entry; rewrite to positive j for final suffix array.= -[] = int64()// Index j was on work queue (encoded as -j but now decoded),// meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is S-type, queue -k for processing later in this loop.// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.// If k is zero, k-1 doesn't exist, so we only need to leave it// for the caller.:= - 1:= []if > 0 {if := [-1]; <= {= -}}if != {[] === []}--[] = int64()}}func induceS_32( []int32, , , []int32) {// Initialize positions for right side of character buckets.bucketMax_32(, , ):= int32(0):= []for := len() - 1; >= 0; -- {:= int([])if >= 0 {// Skip non-flagged entry.// (This loop can't see an empty entry; 0 means the real zero index.)continue}// Negative j is a work queue entry; rewrite to positive j for final suffix array.= -[] = int32()// Index j was on work queue (encoded as -j but now decoded),// meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is S-type, queue -k for processing later in this loop.// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.// If k is zero, k-1 doesn't exist, so we only need to leave it// for the caller.:= - 1:= []if > 0 {if := [-1]; <= {= -}}if != {[] === []}--[] = int32()}}func induceS_64( []int64, , , []int64) {// Initialize positions for right side of character buckets.bucketMax_64(, , ):= int64(0):= []for := len() - 1; >= 0; -- {:= int([])if >= 0 {// Skip non-flagged entry.// (This loop can't see an empty entry; 0 means the real zero index.)continue}// Negative j is a work queue entry; rewrite to positive j for final suffix array.= -[] = int64()// Index j was on work queue (encoded as -j but now decoded),// meaning k := j-1 is L-type,// so we can now place k correctly into sa.// If k-1 is S-type, queue -k for processing later in this loop.// If k-1 is L-type (text[k-1] > text[k]), queue k to save for the caller.// If k is zero, k-1 doesn't exist, so we only need to leave it// for the caller.:= - 1:= []if > 0 {if := [-1]; <= {= -}}if != {[] === []}--[] = int64()}}
![]() |
The pages are generated with Golds v0.7.9-preview. (GOOS=linux GOARCH=amd64) Golds is a Go 101 project developed by Tapir Liu. PR and bug reports are welcome and can be submitted to the issue list. Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds. |