erigon-pulse/sais/gsa/gsaca.go

170 lines
3.0 KiB
Go
Raw Normal View History

package gsa
/*
#include "gsacak.h"
#cgo CFLAGS: -DTERMINATOR=0 -DM64=1 -Dm64=1
*/
import "C"
import (
2022-03-27 04:36:30 +00:00
"fmt"
"unsafe"
)
// Implementation from https://github.com/felipelouza/gsufsort
2022-03-27 04:36:30 +00:00
// see also: https://almob.biomedcentral.com/track/pdf/10.1186/s13015-020-00177-y.pdf
// see also: https://almob.biomedcentral.com/track/pdf/10.1186/s13015-017-0117-9.pdf
2022-03-28 02:02:01 +00:00
func PrintArrays(str []byte, sa []uint, lcp []int, da []int32) {
// remove terminator
2022-03-28 02:02:01 +00:00
n := len(sa) - 1
sa = sa[1:]
lcp = lcp[1:]
da = da[1:]
2022-03-27 04:36:30 +00:00
fmt.Printf("i\t")
fmt.Printf("sa\t")
2022-03-27 04:36:30 +00:00
if lcp != nil {
fmt.Printf("lcp\t")
}
if da != nil {
fmt.Printf("gsa\t")
}
2022-03-27 04:36:30 +00:00
fmt.Printf("suffixes\t")
fmt.Printf("\n")
for i := 0; i < n; i++ {
fmt.Printf("%d\t", i)
fmt.Printf("%d\t", sa[i])
2022-03-27 04:36:30 +00:00
if lcp != nil {
fmt.Printf("%d\t", lcp[i])
}
if da != nil { // gsa
2022-03-27 04:36:30 +00:00
value := sa[i]
if da[i] != 0 {
2022-03-28 02:02:01 +00:00
value = sa[i] - sa[da[i]-1] - 1
2022-03-27 04:36:30 +00:00
}
fmt.Printf("(%d %d)\t", da[i], value)
}
//bwt
// char c = (SA[i])? T[SA[i]-1]-1:terminal;
// if(c==0) c = '$';
// printf("%c\t",c);
2022-03-27 04:36:30 +00:00
for j := sa[i]; int(j) < n; j++ {
if str[j] == 1 {
fmt.Printf("$")
break
} else if str[j] == 0 {
fmt.Printf("#")
} else {
fmt.Printf("%c", str[j]-1)
}
}
fmt.Printf("\n")
}
}
2022-03-28 02:02:01 +00:00
//nolint
// SA2GSA - example func to convert SA+DA to GSA
func SA2GSA(sa []uint, da []int32) []uint {
// remove terminator
sa = sa[1:]
da = da[1:]
n := len(sa) - 1
2022-03-28 03:52:39 +00:00
gsa := make([]uint, n)
copy(gsa, sa)
2022-03-28 02:02:01 +00:00
for i := 0; i < n; i++ {
if da[i] != 0 {
gsa[i] = sa[i] - sa[da[i]-1] - 1
}
}
return gsa
}
2022-03-28 03:52:39 +00:00
func PrintRepeats(str []byte, sa []uint, da []int32) {
sa = sa[1:]
da = da[1:]
n := len(sa) - 1
var repeats int
for i := 0; i < len(da)-1; i++ {
repeats++
if da[i] < da[i+1] { // same suffix
continue
}
// new suffix
fmt.Printf(" repeats: %d\t", repeats)
for j := sa[i]; int(j) < n; j++ {
if str[j] == 1 {
//fmt.Printf("$")
break
} else if str[j] == 0 {
fmt.Printf("#")
} else {
fmt.Printf("%c", str[j]-1)
}
}
fmt.Printf("\n")
repeats = 0
}
}
func GSA(data []byte, sa []uint, lcp []int, da []int32) error {
tPtr := unsafe.Pointer(&data[0]) // source "text"
var lcpPtr, saPtr, daPtr unsafe.Pointer
if sa != nil {
saPtr = unsafe.Pointer(&sa[0])
}
if lcp != nil {
lcpPtr = unsafe.Pointer(&lcp[0])
}
if da != nil {
daPtr = unsafe.Pointer(&da[0])
}
depth := C.gsacak(
(*C.uchar)(tPtr),
(*C.uint_t)(saPtr),
(*C.int_t)(lcpPtr),
(*C.int_da)(daPtr),
C.uint_t(len(data)),
)
_ = depth
return nil
}
func ConcatAll(R [][]byte) (str []byte, n int) {
for i := 0; i < len(R); i++ {
n += len(R[i]) + 1
}
n++ //add 0 at the end
str = make([]byte, n)
var l, max int
k := len(R)
for i := 0; i < k; i++ {
m := len(R[i])
if m > max {
max = m
}
for j := 0; j < m; j++ {
if R[i][j] < 255 && R[i][j] > 1 {
str[l] = R[i][j] + 1
l++
}
}
if m > 0 {
if str[l-1] > 1 {
str[l] = 1
l++
} //add 1 as separator (ignores empty entries)
}
}
str[l] = 0
l++
n = l
return str, n
}