mirror of
https://gitlab.com/pulsechaincom/erigon-pulse.git
synced 2024-12-28 14:47:16 +00:00
parent
8ffada811a
commit
2c61236c58
40
sais/gsa/gsa_test.go
Normal file
40
sais/gsa/gsa_test.go
Normal file
@ -0,0 +1,40 @@
|
||||
package gsa
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func ExampleGSA() {
|
||||
R := [][]byte{[]byte("hihihi")}
|
||||
str, n := ConcatAll(R)
|
||||
sa2 := make([]uint, SaSize(n))
|
||||
lcp := make([]int, LcpSize(n))
|
||||
_ = GSA(str, sa2, lcp, nil)
|
||||
for i := 0; i < n; i++ {
|
||||
j := sa2[i]
|
||||
for ; int(j) < n; j++ {
|
||||
if str[j] == 1 {
|
||||
fmt.Printf("$")
|
||||
break
|
||||
} else if str[j] == 0 {
|
||||
fmt.Printf("#")
|
||||
} else {
|
||||
fmt.Printf("%c", str[j]-1)
|
||||
}
|
||||
}
|
||||
fmt.Printf("\n")
|
||||
}
|
||||
fmt.Printf("%d\n", sa2)
|
||||
}
|
||||
|
||||
func TestGSA(t *testing.T) {
|
||||
R := [][]byte{{4, 5, 6, 4, 5, 6, 4, 5, 6}}
|
||||
str, n := ConcatAll(R)
|
||||
sa := make([]uint, SaSize(n))
|
||||
lcp := make([]int, n)
|
||||
_ = GSA(str, sa, lcp, nil)
|
||||
assert.Equal(t, []uint{10, 9, 6, 3, 0, 7, 4, 1, 8, 5, 2}, sa[:n])
|
||||
}
|
76
sais/gsa/gsaca.go
Normal file
76
sais/gsa/gsaca.go
Normal file
@ -0,0 +1,76 @@
|
||||
package gsa
|
||||
|
||||
/*
|
||||
#include "gsacak.h"
|
||||
*/
|
||||
import "C"
|
||||
import (
	"errors"
	"unsafe"
)
|
||||
|
||||
// Implementation from https://github.com/felipelouza/gsufsort
|
||||
|
||||
// SaSize returns l multiplied by the size in bytes of a Go uint, i.e. the
// number of bytes taken by l suffix-array entries.
func SaSize(l int) int {
	const entryBytes = int(unsafe.Sizeof(uint(0)))
	return entryBytes * l
}
|
||||
// LcpSize returns l multiplied by the size in bytes of a Go uint, i.e. the
// number of bytes taken by l word-sized LCP entries.
func LcpSize(l int) int {
	const entryBytes = int(unsafe.Sizeof(uint(0)))
	return entryBytes * l
}
|
||||
func GSA(data []byte, sa []uint, lcp []int, da []int32) error {
|
||||
tPtr := unsafe.Pointer(&data[0]) // source "text"
|
||||
var lcpPtr, saPtr, daPtr unsafe.Pointer
|
||||
if sa != nil {
|
||||
saPtr = unsafe.Pointer(&sa[0])
|
||||
}
|
||||
if lcp != nil {
|
||||
lcpPtr = unsafe.Pointer(&lcp[0])
|
||||
}
|
||||
if da != nil {
|
||||
daPtr = unsafe.Pointer(&da[0])
|
||||
}
|
||||
depth := C.gsacak(
|
||||
(*C.uchar)(tPtr),
|
||||
(*C.uint_t)(saPtr),
|
||||
(*C.int_t)(lcpPtr),
|
||||
(*C.int_da)(daPtr),
|
||||
C.uint_t(len(data)),
|
||||
)
|
||||
_ = depth
|
||||
return nil
|
||||
}
|
||||
|
||||
// ConcatAll packs the byte strings in R into one text using the layout that
// gsacak.h describes (separator bytes equal to 1, final byte 0) and returns
// that text together with its length n.
//
// Every kept input byte b is stored as b+1, which keeps the values 0 and 1
// free for the terminator and the separator. Input bytes 0, 1 and 255 are
// dropped. A separator is written after each entry that contributed at least
// one byte; empty entries and entries whose bytes were all dropped add
// nothing. A single 0 terminates the text.
//
// The returned slice holds exactly n bytes, so len(str) == n even when input
// bytes were dropped. For example, {"hi"} yields {105, 106, 1, 0} and n == 4.
func ConcatAll(R [][]byte) (str []byte, n int) {
	// Upper bound on the result: every byte kept, one separator per entry and
	// one terminator.
	capacity := 1
	for _, entry := range R {
		capacity += len(entry) + 1
	}
	str = make([]byte, 0, capacity)

	for _, entry := range R {
		for _, b := range entry {
			if b > 1 && b < 255 {
				str = append(str, b+1)
			}
		}
		// Add a separator only when the text currently ends in a real symbol.
		// The length check covers the case where nothing has been written yet
		// (for instance a first entry made only of dropped bytes).
		if len(entry) > 0 && len(str) > 0 && str[len(str)-1] > 1 {
			str = append(str, 1)
		}
	}
	str = append(str, 0)
	return str, len(str)
}
|
2536
sais/gsa/gsacak.c
Normal file
2536
sais/gsa/gsacak.c
Normal file
File diff suppressed because it is too large
Load Diff
147
sais/gsa/gsacak.h
Normal file
147
sais/gsa/gsacak.h
Normal file
@ -0,0 +1,147 @@
|
||||
// vim: noai:ts=2:sw=2
|
||||
|
||||
/*
|
||||
* Authors: Felipe A. Louza, Simon Gog, Guilherme P. Telles
|
||||
* contact: louza@ic.unicamp.br
|
||||
* 03/04/2017
|
||||
*/
|
||||
|
||||
/*
|
||||
* This code is a modification of SACA-K algorithm by G. Nong, which can be
|
||||
* retrieved at: http://code.google.com/p/ge-nong/
|
||||
*
|
||||
* Our version of SACA-K, called gSACA-K, maintains the theoretical bounds of the
|
||||
* original algorithm to construct the generalized suffix array.
|
||||
*
|
||||
* Our algorithm gSACA-K can also compute the LCP-array and the Document-array
|
||||
* with no additional costs.
|
||||
*
|
||||
* gsacak(s, SA, NULL, NULL, n) //computes only SA
|
||||
* gsacak(s, SA, LCP, NULL, n) //computes SA and LCP
|
||||
* gsacak(s, SA, NULL, DA, n) //computes SA and DA
|
||||
* gsacak(s, SA, LCP, DA, n) //computes SA, LCP and DA
|
||||
*
|
||||
*/
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
#ifndef GSACAK_H
|
||||
#define GSACAK_H
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
#include <inttypes.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
#define max(a,b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
#ifndef DEBUG
|
||||
#define DEBUG 0
|
||||
#endif
|
||||
|
||||
#ifndef M64
|
||||
#define M64 1
|
||||
#endif
|
||||
|
||||
#if M64
|
||||
typedef int64_t int_t;
|
||||
typedef uint64_t uint_t;
|
||||
#define PRIdN PRId64
|
||||
#define U_MAX UINT64_MAX
|
||||
#define I_MAX INT64_MAX
|
||||
#define I_MIN INT64_MIN
|
||||
#else
|
||||
typedef int32_t int_t;
|
||||
typedef uint32_t uint_t;
|
||||
#define PRIdN PRId32
|
||||
#define U_MAX UINT32_MAX
|
||||
#define I_MAX INT32_MAX
|
||||
#define I_MIN INT32_MIN
|
||||
#endif
|
||||
|
||||
/*! @option type of s[0,n-1] for integer alphabets
|
||||
*
|
||||
* @constraint sizeof(int_t) >= sizeof(int_text)
|
||||
*/
|
||||
typedef uint32_t int_text; //4N bytes for s[0..n-1]
|
||||
#define PRIdT PRIu32
|
||||
|
||||
/*! @option type for array DA
|
||||
*/
|
||||
typedef int32_t int_da;
|
||||
#define PRIdA PRId32
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/** @brief computes the suffix array of string s[0..n-1]
|
||||
*
|
||||
* @param s input string with s[n-1]=0
|
||||
* @param SA suffix array
|
||||
* @param n string length
|
||||
* @return -1 if an error occurred, otherwise the depth of the recursive calls.
|
||||
*/
|
||||
int sacak(unsigned char *s, uint_t *SA, uint_t n);
|
||||
|
||||
/** @brief computes the suffix array of string s[0..n-1]
|
||||
*
|
||||
* @param k alphabet size+1 (0 is reserved)
|
||||
*/
|
||||
int sacak_int(int_text *s, uint_t *SA, uint_t n, uint_t k);
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/** @brief Computes the suffix array SA (LCP, DA) of T^cat in s[0..n-1]
|
||||
*
|
||||
* @param s input concatenated string, using separators s[i]=1 and with s[n-1]=0
|
||||
* @param SA Suffix array
|
||||
* @param LCP LCP array
|
||||
* @param DA Document array
|
||||
* @param n String length
|
||||
*
|
||||
* @return depth of the recursive calls.
|
||||
*/
|
||||
int gsacak(unsigned char *s, uint_t *SA, int_t *LCP, int_da *DA, uint_t n);
|
||||
|
||||
/** @brief Computes the suffix array SA (LCP, DA) of T^cat in s[0..n-1]
|
||||
*
|
||||
* @param s input concatenated string, using separators s[i]=1 and with s[n-1]=0
|
||||
* @param SA Suffix array
|
||||
* @param LCP LCP array
|
||||
* @param DA Document array
|
||||
* @param n String length
|
||||
* @param k alphabet size+2 (0 and 1 are reserved)
|
||||
*
|
||||
* @return depth of the recursive calls.
|
||||
*/
|
||||
int gsacak_int(int_text *s, uint_t *SA, int_t *LCP, int_da *DA, uint_t n, uint_t k);
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
/*
 * NOTE(review): this lowercase m64 switch and the typedefs that follow it
 * repeat the int_t / uint_t / PRIdN / int_text definitions made earlier in
 * this header under the overridable M64 macro. m64 is fixed to 1 here, so a
 * build that sets M64 to 0 would see two disagreeing sets of typedefs.
 * Confirm whether this section can be folded into the M64 block.
 */
#define m64 1
|
||||
|
||||
#if m64
|
||||
typedef int64_t int_t;
|
||||
typedef uint64_t uint_t;
|
||||
#define PRIdN PRId64
|
||||
#else
|
||||
typedef int32_t int_t;
|
||||
typedef uint32_t uint_t;
|
||||
#define PRIdN PRId32
|
||||
#endif
|
||||
|
||||
typedef uint32_t int_text;
|
||||
|
||||
|
||||
|
||||
int_t SACA_K(int_t *s, uint_t *SA,
|
||||
uint_t n, unsigned int K,
|
||||
uint_t m, int cs, int level);
|
||||
|
||||
int_t gSACA_K(uint_t *s, uint_t *SA,
|
||||
uint_t n, unsigned int K,
|
||||
int cs, uint_t separator, int level);
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user