From 2c61236c5880afbb71953576e3788b1e71f2d6b4 Mon Sep 17 00:00:00 2001 From: Alex Sharov Date: Sat, 26 Mar 2022 20:47:27 +0700 Subject: [PATCH] Generalized suffix array (#396) * gsa * gsa * save * save --- sais/gsa/gsa_test.go | 40 + sais/gsa/gsaca.go | 76 ++ sais/gsa/gsacak.c | 2536 ++++++++++++++++++++++++++++++++++++++++++ sais/gsa/gsacak.h | 147 +++ 4 files changed, 2799 insertions(+) create mode 100644 sais/gsa/gsa_test.go create mode 100644 sais/gsa/gsaca.go create mode 100644 sais/gsa/gsacak.c create mode 100644 sais/gsa/gsacak.h diff --git a/sais/gsa/gsa_test.go b/sais/gsa/gsa_test.go new file mode 100644 index 000000000..71817d0b2 --- /dev/null +++ b/sais/gsa/gsa_test.go @@ -0,0 +1,40 @@ +package gsa + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/assert" +) + +func ExampleGSA() { + R := [][]byte{[]byte("hihihi")} + str, n := ConcatAll(R) + sa2 := make([]uint, SaSize(n)) + lcp := make([]int, LcpSize(n)) + _ = GSA(str, sa2, lcp, nil) + for i := 0; i < n; i++ { + j := sa2[i] + for ; int(j) < n; j++ { + if str[j] == 1 { + fmt.Printf("$") + break + } else if str[j] == 0 { + fmt.Printf("#") + } else { + fmt.Printf("%c", str[j]-1) + } + } + fmt.Printf("\n") + } + fmt.Printf("%d\n", sa2) +} + +func TestGSA(t *testing.T) { + R := [][]byte{{4, 5, 6, 4, 5, 6, 4, 5, 6}} + str, n := ConcatAll(R) + sa := make([]uint, SaSize(n)) + lcp := make([]int, n) + _ = GSA(str, sa, lcp, nil) + assert.Equal(t, []uint{10, 9, 6, 3, 0, 7, 4, 1, 8, 5, 2}, sa[:n]) +} diff --git a/sais/gsa/gsaca.go b/sais/gsa/gsaca.go new file mode 100644 index 000000000..509cc8c2a --- /dev/null +++ b/sais/gsa/gsaca.go @@ -0,0 +1,76 @@ +package gsa + +/* +#include "gsacak.h" +*/ +import "C" +import ( + "unsafe" +) + +// Implementation from https://github.com/felipelouza/gsufsort + +func SaSize(l int) int { + var a uint + return l * int(unsafe.Sizeof(a)) +} +func LcpSize(l int) int { + var a uint + return l * int(unsafe.Sizeof(a)) +} +func GSA(data []byte, sa []uint, lcp []int, da []int32) error { + tPtr := unsafe.Pointer(&data[0]) // source "text" + var lcpPtr, saPtr, daPtr unsafe.Pointer + if sa != nil { + saPtr = unsafe.Pointer(&sa[0]) + } + if lcp != nil { + lcpPtr = unsafe.Pointer(&lcp[0]) + } + if da != nil { + daPtr = unsafe.Pointer(&da[0]) + } + depth := C.gsacak( + (*C.uchar)(tPtr), + (*C.uint_t)(saPtr), + (*C.int_t)(lcpPtr), + (*C.int_da)(daPtr), + C.uint_t(len(data)), + ) + _ = depth + return nil +} + +func ConcatAll(R [][]byte) (str []byte, n int) { + for i := 0; i < len(R); i++ { + n += len(R[i]) + 1 + } + + n++ //add 0 at the end + str = make([]byte, n) + var l, max int + k := len(R) + + for i := 0; i < k; i++ { + m := len(R[i]) + if m > max { + max = m + } + for j := 0; j < m; j++ { + if R[i][j] < 255 && R[i][j] > 1 { + str[l] = R[i][j] + 1 + l++ + } + } + if m > 0 { + if str[l-1] > 1 { + str[l] = 1 + l++ + } //add 1 as separator (ignores empty entries) + } + } + str[l] = 0 + l++ + n = l + return str, n +} diff --git a/sais/gsa/gsacak.c b/sais/gsa/gsacak.c new file mode 100644 index 000000000..054ec2166 --- /dev/null +++ b/sais/gsa/gsacak.c @@ -0,0 +1,2536 @@ +// vim: noai:ts=2:sw=2 + +#include "gsacak.h" + +// set only the highest bit as 1, i.e. 1000... +//const unsigned int EMPTY_k=((unsigned int)1)<<(sizeof(unsigned int)*8-1); +const uint_t EMPTY_k=((uint_t)1)<<(sizeof(uint_t)*8-1); + +// get s[i] at a certain level +#define chr(i) (cs==sizeof(int_t)?((int_t*)s)[i]:(cs==sizeof(int_text)?((int_text*)s)[i]:((unsigned char *)s)[i])) + +#define true 1 +#define false 0 + +#define DEPTH 0 // compute time and size of reduced problem for each recursion call +#define PHASES 0 // compute time for each phase +#define RMQ_L 2 //variants = (1, trivial) (2, using Gog's stack) +#define RMQ_S 2 //variants = (1, trivial) (2, using Gog's stack) + +#define STACK_SIZE_L 894 //to use 10Kb of working space +#define STACK_SIZE_S 894 //to use 10Kb of working space + +#define EMPTY_STRING 0 //check if there is an empty string in the input collection + +typedef struct _pair{ + uint_t idx; + int_t lcp; +} t_pair_k; + +int compare_k (const void * a, const void * b){ + if(*(const uint_t *)a < *(const uint_t *)b) return -1; + if(*(const uint_t *)a > *(const uint_t *)b) return 1; +return 0; +} + +void stack_push_k(t_pair_k* STACK, int_t *top, uint_t idx, int_t lcp){ + + STACK[*top].idx=idx; + STACK[*top].lcp=lcp; + + (*top)++; +} + +void compute_lcp_phi_sparse(int_t *s, uint_t *SA1, + uint_t *RA, int_t *LCP, int_t *PLCP, + uint_t n1, int cs, uint_t separator) { + + uint_t i; + + PLCP[SA1[0]]=0;//PLCP* (lms) is stored in PLCP array + for(i=1; i0; i--) { + j=SA[i]; SA[i]=0; + SA[bkt[chr(j)]--]=j; + } + SA[0]=n-1; // set the single sentinel suffix. +} + +void putSuffix0_generalized(uint_t *SA, + uint_t *s, uint_t *bkt, + uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { + uint_t i, j; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] + + // put the suffixes into their buckets. + for(i=n1-1; i>0; i--) { + j=SA[i]; SA[i]=0; + SA[bkt[chr(j)]--]=j; + } + + // SA[0]=n-1; // set the single sentinel suffix. + + SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] + +} + +void putSuffix0_generalized_LCP(uint_t *SA, int_t *LCP, + uint_t *s, uint_t *bkt, + uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { + uint_t i, j; + int_t l; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] + + // put the suffixes into their buckets. + for(i=n1-1; i>0; i--) { + j=SA[i]; SA[i]=U_MAX; + l=LCP[i]; LCP[i]=0; + + SA[bkt[chr(j)]]=j; + LCP[bkt[chr(j)]--]=l; + } + + // SA[0]=n-1; // set the single sentinel suffix. + + SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] + +} + +void putSuffix0_generalized_DA(uint_t *SA, int_da *DA, + uint_t *s, uint_t *bkt, + uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { + uint_t i, j; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] + + // put the suffixes into their buckets. + for(i=n1-1; i>0; i--) { + j=SA[i]; SA[i]=0; + SA[bkt[chr(j)]]=j; + DA[bkt[chr(j)]--]=DA[i]; + } + + // SA[0]=n-1; // set the single sentinel suffix. + + SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] + DA[tmp]= (int_da) tmp-1; + +} + + +void putSuffix0_generalized_LCP_DA(uint_t *SA, int_t *LCP, int_da *DA, + uint_t *s, uint_t *bkt, + uint_t n, unsigned int K, int_t n1, int cs, uint_t separator) { + uint_t i, j; + int_t l; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + int_t tmp=bkt[separator]--;// shifts one position left of bkt[separator] + + // put the suffixes into their buckets. + for(i=n1-1; i>0; i--) { + j=SA[i]; SA[i]=U_MAX; + l=LCP[i]; LCP[i]=0; + + SA[bkt[chr(j)]]=j; + DA[bkt[chr(j)]]=DA[i]; + LCP[bkt[chr(j)]--]=l; + } + + // SA[0]=n-1; // set the single sentinel suffix. + + SA[tmp]=SA[0]-1;// insert the last separator at the end of bkt[separator] + DA[tmp]= (int_da) tmp-1; +} + +/*****************************************************************************/ + +void induceSAl0(uint_t *SA, + int_t *s, uint_t *bkt, + uint_t n, unsigned int K, int_t suffix, int cs) { + uint_t i, j; + + // find the head of each bucket. + getBuckets_k(s, bkt, n, K, false, cs); + + bkt[0]++; // skip the virtual sentinel. + for(i=0; i0) { + j=SA[i]-1; + if(chr(j)>=chr(j+1)) { + SA[bkt[chr(j)]]=j; + bkt[chr(j)]++; + if(!suffix && i>0) SA[i]=0; + } + } +} + +void induceSAs0(uint_t *SA, + int_t *s, uint_t *bkt, + uint_t n, unsigned int K, int_t suffix, int cs) { + uint_t i, j; + + // find the end of each bucket. + getBuckets_k(s, bkt, n, K, true, cs); + + for(i=n-1; i>0; i--) + if(SA[i]>0) { + j=SA[i]-1; + if(chr(j)<=chr(j+1) && bkt[chr(j)]0) { + j=SA[i]-1; + if(chr(j)>=chr(j+1) ) { + if(chr(j)!=separator)//gsa-is + SA[bkt[chr(j)]++]=j; + if(!suffix && i>0) SA[i]=0; + } + } +} + +void induceSAs0_generalized(uint_t *SA, + uint_t *s, uint_t *bkt, + uint_t n, uint_t K, int_t suffix, int cs, uint_t separator) { + uint_t i, j; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + for(i=n-1; i>0; i--) + if(SA[i]>0) { + j=SA[i]-1; + if(chr(j)<=chr(j+1) && bkt[chr(j)]LCP[i]) M[k] = max(0,LCP[i]); + #elif RMQ_L == 2 + int_t min_lcp=0; + uint_t last; + + if(!SA[i]) last = 0; + else{ + last = last_occ[chr(SA[i]-1)]; + last_occ[chr(SA[i]-1)] = i+1; + } + + int_t lcp=max(0,LCP[i]); + while(STACK[(top)-1].lcp>=lcp) (top)--; + + stack_push_k(STACK, &top, i+1, lcp); + j = top-1; + + while(STACK[j].idx>last) j--; + min_lcp=STACK[(j+1)].lcp; + + #endif + + if(SA[i]>0) { + j=SA[i]-1; + if(chr(j)>=chr(j+1)) + if(chr(j)!=separator){//gsa-is + SA[bkt[chr(j)]]=j; + + #if RMQ_L == 1 + LCP[bkt[chr(j)]]+=M[chr(j)]+1; + M[chr(j)] = I_MAX; + #elif RMQ_L == 2 + LCP[bkt[chr(j)]]+=min_lcp+1; + #endif + + bkt[chr(j)]++; + } + if(bkt[chr(SA[i])]-1STACK_SIZE_L){//if stack is full + + int_t j; + memcpy(tmp, last_occ, K*sizeof(uint_t)); + qsort(tmp, K, sizeof(uint_t), compare_k); + + int_t curr=1, end=1; + STACK[top].idx=U_MAX; + for(j=0;j=STACK_SIZE_L){ + fprintf(stderr,"ERROR: induceSAl0_LCP\n"); + exit(1); + } + + top = end; + } + #endif + } + } + #if RMQ_L == 1 + free(M); + #elif RMQ_L == 2 + free(STACK); + free(last_occ); + free(tmp); + #endif + +} + +void induceSAs0_generalized_LCP(uint_t *SA, int_t* LCP, + uint_t *s, uint_t *bkt, + uint_t n, uint_t K, int cs, uint_t separator) { + uint_t i, j; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + #if RMQ_S == 1 + int_t *M=(int_t *)malloc(sizeof(int_t)*K); + for(i=0;i0; i--){ + if(SA[i]>0) { + j=SA[i]-1; + if(chr(j)<=chr(j+1) && bkt[chr(j)]=0) + LCP[bkt[chr(j)]+1]=M[chr(j)]+1; + + if(LCP[bkt[chr(j)]]>0) + LCP[bkt[chr(j)]]=I_MAX; + + #elif RMQ_S == 2 + int_t min = I_MAX, end = top-1; + + int_t last=last_occ[chr(j)]; + while(STACK[end].idx<=last) end--; + + min=STACK[(end+1)].lcp; + last_occ[chr(j)] = i; + + if(LCP[bkt[chr(j)]+1]>=0) + LCP[bkt[chr(j)]+1]=min+1; + #endif + + + #if RMQ_S == 1 + M[chr(j)] = I_MAX; + #endif + + bkt[chr(j)]--; + + if(SA[bkt[chr(j)]]!=U_MAX) {//L/S-seam + int_t l=0; + while(chr(SA[bkt[chr(j)]+1]+l)==chr(SA[bkt[chr(j)]]+l))++l; + LCP[bkt[chr(j)]+1]=l; + } + } + } + + if(LCP[i]<0) LCP[i]=0; + #if RMQ_S == 1 + int_t k; + for(k=0; kLCP[i]) M[k] = LCP[i]; + #elif RMQ_S == 2 + + int_t lcp=max(0,LCP[i]); + + while(STACK[(top)-1].lcp>=lcp) (top)--; + stack_push_k(STACK, &top, i, lcp); + + if(top>=STACK_SIZE_S){ + + int_t j; + memcpy(tmp, last_occ, K*sizeof(uint_t)); + qsort(tmp, K, sizeof(uint_t), compare_k); + + int_t curr=1, end=1; + + for(j=K-1;j>=0; j--){ + + if(tmp[j] < STACK[end-1].idx){ + + while(STACK[curr].idx>tmp[j] && curr < top) curr++; + if(curr>=top) break; + stack_push_k(STACK, &end, STACK[curr].idx, STACK[curr].lcp); + curr++; + } + } + + if(end>=STACK_SIZE_S){ + fprintf(stderr,"ERROR: induceSAl0_LCP\n"); + exit(1); + } + top = end; + } + #endif + + } + + LCP[0]=0; + + //variant 1 + #if RMQ_S == 1 + free(M); + #elif RMQ_S == 2 + free(STACK); + free(last_occ); + free(tmp); + #endif +} + + +/*****************************************************************************/ + +void induceSAl0_generalized_DA(uint_t *SA, int_da* DA, + uint_t *s, uint_t *bkt, + uint_t n, unsigned int K, int cs, uint_t separator) { + uint_t i, j; + + // find the head of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, false, cs); + + bkt[0]++; // skip the virtual sentinel. + for(i=0; i0) { + j=SA[i]-1; + if(chr(j)>=chr(j+1) ) { + if(chr(j)!=separator){//gsa-is + SA[bkt[chr(j)]]=j; + DA[bkt[chr(j)]++]=DA[i]; + } + } + } +} + +void induceSAs0_generalized_DA(uint_t *SA, int_da* DA, + uint_t *s, uint_t *bkt, + uint_t n, uint_t K, int cs, uint_t separator) { + uint_t i, j; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + for(i=n-1; i>0; i--) + if(SA[i]>0) { + j=SA[i]-1; + if(chr(j)<=chr(j+1) && bkt[chr(j)]LCP[i]) M[k] = max(0,LCP[i]); + #elif RMQ_L == 2 + int_t min_lcp=0; + uint_t last; + + if(!SA[i]) last = 0; + else{ + last = last_occ[chr(SA[i]-1)]; + last_occ[chr(SA[i]-1)] = i+1; + } + + int_t lcp=max(0,LCP[i]); + while(STACK[(top)-1].lcp>=lcp) (top)--; + + stack_push_k(STACK, &top, i+1, lcp); + j = top-1; + + while(STACK[j].idx>last) j--; + min_lcp=STACK[(j+1)].lcp; + + #endif + + if(SA[i]>0) { + j=SA[i]-1; + if(chr(j)>=chr(j+1)) + if(chr(j)!=separator){//gsa-is + SA[bkt[chr(j)]]=j; + DA[bkt[chr(j)]]=DA[i]; + + #if RMQ_L == 1 + LCP[bkt[chr(j)]]+=M[chr(j)]+1; + M[chr(j)] = I_MAX; + #elif RMQ_L == 2 + LCP[bkt[chr(j)]]+=min_lcp+1; + #endif + + bkt[chr(j)]++; + } + if(bkt[chr(SA[i])]-1STACK_SIZE_L){//if stack is full + + int_t j; + memcpy(tmp, last_occ, K*sizeof(uint_t)); + qsort(tmp, K, sizeof(uint_t), compare_k); + + int_t curr=1, end=1; + STACK[top].idx=U_MAX; + for(j=0;j=STACK_SIZE_L){ + fprintf(stderr,"ERROR: induceSAl0_LCP\n"); + exit(1); + } + + top = end; + } + #endif + } + } + #if RMQ_L == 1 + free(M); + #elif RMQ_L == 2 + free(STACK); + free(last_occ); + free(tmp); + #endif + +} + +void induceSAs0_generalized_LCP_DA(uint_t *SA, int_t* LCP, int_da* DA, + uint_t *s, uint_t *bkt, + uint_t n, uint_t K, int cs, uint_t separator) { + uint_t i, j; + + // find the end of each bucket. + getBuckets_k((int_t*)s, bkt, n, K, true, cs); + + #if RMQ_S == 1 + int_t *M=(int_t *)malloc(sizeof(int_t)*K); + for(i=0;i0; i--){ + if(SA[i]>0) { + j=SA[i]-1; + if(chr(j)<=chr(j+1) && bkt[chr(j)]=0) + LCP[bkt[chr(j)]+1]=M[chr(j)]+1; + + if(LCP[bkt[chr(j)]]>0) + LCP[bkt[chr(j)]]=I_MAX; + + #elif RMQ_S == 2 + int_t min = I_MAX, end = top-1; + + int_t last=last_occ[chr(j)]; + while(STACK[end].idx<=last) end--; + + min=STACK[(end+1)].lcp; + last_occ[chr(j)] = i; + + if(LCP[bkt[chr(j)]+1]>=0) + LCP[bkt[chr(j)]+1]=min+1; + #endif + + + #if RMQ_S == 1 + M[chr(j)] = I_MAX; + #endif + + bkt[chr(j)]--; + + if(SA[bkt[chr(j)]]!=U_MAX) {//L/S-seam + int_t l=0; + while(chr(SA[bkt[chr(j)]+1]+l)==chr(SA[bkt[chr(j)]]+l))++l; + LCP[bkt[chr(j)]+1]=l; + } + } + } + + if(LCP[i]<0) LCP[i]=0; + #if RMQ_S == 1 + int_t k; + for(k=0; kLCP[i]) M[k] = LCP[i]; + #elif RMQ_S == 2 + + int_t lcp=max(0,LCP[i]); + + while(STACK[(top)-1].lcp>=lcp) (top)--; + stack_push_k(STACK, &top, i, lcp); + + if(top>=STACK_SIZE_S){ + + int_t j; + memcpy(tmp, last_occ, K*sizeof(uint_t)); + qsort(tmp, K, sizeof(uint_t), compare_k); + + int_t curr=1, end=1; + + for(j=K-1;j>=0; j--){ + + if(tmp[j] < STACK[end-1].idx){ + + while(STACK[curr].idx>tmp[j] && curr < top) curr++; + if(curr>=top) break; + stack_push_k(STACK, &end, STACK[curr].idx, STACK[curr].lcp); + curr++; + } + } + + if(end>=STACK_SIZE_S){ + fprintf(stderr,"ERROR: induceSAl0_LCP\n"); + exit(1); + } + top = end; + } + #endif + + } + + LCP[0]=0; + + //variant 1 + #if RMQ_S == 1 + free(M); + #elif RMQ_S == 2 + free(STACK); + free(last_occ); + free(tmp); + #endif +} + + +/*****************************************************************************/ + + +void putSubstr0(uint_t *SA, + int_t *s, uint_t *bkt, + uint_t n, unsigned int K, int cs) { + uint_t i, cur_t, succ_t; + + // find the end of each bucket. + getBuckets_k(s, bkt, n, K, true, cs); + + // set each item in SA as empty. + for(i=0; i0; i--) { + cur_t=(chr(i-1)0; i--) { + cur_t=(chr(i-1)0; i--) { + j=SA[i]; SA[i]=EMPTY_k; + cur=chr(j); + if(cur!=pre) { + pre=cur; pos=cur; + } + SA[pos--]=j; + } +} + +void induceSAl1(int_t *SA, int_t *s, + int_t n, int_t suffix, int cs) { + int_t h, i, j, step=1; + + for(i=0; i=c1; + if(!isL) continue; + + // s[j] is L-type. + + int_t d=SA[c]; + if(d>=0) { + // SA[c] is borrowed by the left + // neighbor bucket. + // shift-left the items in the + // left neighbor bucket. + int_t foo, bar; + foo=SA[c]; + for(h=c-1; SA[h]>=0||SA[h]==EMPTY_k; h--) + { bar=SA[h]; SA[h]=foo; foo=bar; } + SA[h]=foo; + if(hn-1 || SA[pos]!=EMPTY_k) { + // we are running into the right + // neighbor bucket. + // shift-left one step the items + // of bucket(SA, S, j). + for(h=0; h<-d; h++) + SA[c+h]=SA[c+h+1]; + pos--; + if(c(c2=chr(j+2)) || (c1==c2 && c10) { + int_t i1=(step==0)?i-1:i; + SA[i1]=EMPTY_k; + } + } + + // scan to shift-left the items in each bucket + // with its head being reused as a counter. + for(i=1; i0; i-=step) { + step=1; j=SA[i]-1; + if(SA[i]<=0) continue; + int_t c=chr(j), c1=chr(j+1); + int_t isS=(ci); + if(!isS) continue; + + // s[j] is S-type + + int_t d=SA[c]; + if(d>=0) { + // SA[c] is borrowed by the right + // neighbor bucket. + // shift-right the items in the + // right neighbor bucket. + int_t foo, bar; + foo=SA[c]; + for(h=c+1; SA[h]>=0||SA[h]==EMPTY_k; h++) + { bar=SA[h]; SA[h]=foo; foo=bar; } + SA[h]=foo; + if(h>i) step=0; + + d=EMPTY_k; + } + + if(d==EMPTY_k) { // SA[c] is empty. + if(SA[c-1]==EMPTY_k) { + SA[c]=-1; // init the counter. + SA[c-1]=j; + } + else + SA[c]=j; // a size-1 bucket. + } + else { // SA[c] is reused as a counter. + int_t pos=c+d-1; + if(SA[pos]!=EMPTY_k) { + // we are running into the left + // neighbor bucket. + // shift-right one step the items + // of bucket(SA, S, j). + for(h=0; h<-d; h++) + SA[c-h]=SA[c-h-1]; + pos++; + if(c>i) step=0; + } + else + SA[c]--; + + SA[pos]=j; + } + + if(!suffix) { + int_t i1=(step==0)?i+1:i; + SA[i1]=EMPTY_k; + } + } + + // scan to shift-right the items in each bucket + // with its head being reused as a counter. + if(!suffix) + for(i=n-1; i>0; i--) { + j=SA[i]; + if(j<0 && j!=EMPTY_k) { // is SA[i] a counter? + for(h=0; h<-j; h++) + SA[i-h]=SA[i-h-1]; + SA[i-h]=EMPTY_k; + } + } +} + +void putSubstr1(int_t *SA, int_t *s, int_t n, int cs) { + int_t h, i, j; + + for(i=0; i0; i--) { + c=c1; t=t1; + c1=chr(i-1); + t1=c1=0) { + // SA[c] is borrowed by the right + // neighbor bucket. + // shift-right the items in the + // right neighbor bucket. + int_t foo, bar; + foo=SA[c]; + for(h=c+1; SA[h]>=0; h++) + { bar=SA[h]; SA[h]=foo; foo=bar; } + SA[h]=foo; + + SA[c]=EMPTY_k; + } + + int_t d=SA[c]; + if(d==EMPTY_k) { // SA[c] is empty. + if(SA[c-1]==EMPTY_k) { + SA[c]=-1; // init the counter. + SA[c-1]=i; + } + else + SA[c]=i; // a size-1 bucket. + } + else { // SA[c] is reused as a counter + int_t pos=c+d-1; + if(SA[pos]!=EMPTY_k) { + // we are running into the left + // neighbor bucket. + // shift-right one step the items + // of bucket(SA, S, i). + for(h=0; h<-d; h++) + SA[c-h]=SA[c-h-1]; + pos++; + } + else + SA[c]--; + + SA[pos]=i; + } + } + } + + // scan to shift-right the items in each bucket + // with its head being reused as a counter. + for(i=n-1; i>0; i--) { + j=SA[i]; + if(j<0 && j!=EMPTY_k) { // is SA[i] a counter? + for(h=0; h<-j; h++) + SA[i-h]=SA[i-h-1]; + SA[i-h]=EMPTY_k; + } + } + + // put the single sentinel LMS-substring. + SA[0]=n-1; +} + +uint_t getLengthOfLMS(int_t *s, + uint_t n, int level, uint_t x, int cs) { + if(x==n-1) return 1; + + uint_t dist=0, i=1; + while(1) { + if(chr(x+i)n-1 || chr(x+i)>chr(x+i-1)) break; + if(x+i==n-1 || chr(x+i)=n1; i--) + if(SA[i]!=EMPTY_k) SA[j--]=SA[i]; + + // rename each S-type character of the + // interim s1 as the end of its bucket + // to produce the final s1. + succ_t=1; + for(i=n1-1; i>0; i--) { + int_t ch=s1[i], ch1=s1[i-1]; + cur_t=(ch1< ch || (ch1==ch && succ_t==1))?1:0; + if(cur_t==1) { + s1[i-1]+=SA[s1[i-1]]-1; + } + succ_t=cur_t; + } + + return name_ctr; +} + +/*****************************************************************************/ + +uint_t nameSubstr_generalized(uint_t *SA, + uint_t *s, uint_t *s1, uint_t n, + uint_t m, uint_t n1, int level, int cs, uint_t separator) { + uint_t i, j, cur_t, succ_t; + + // init the name array buffer + for(i=n1; i=n1; i--) + if(SA[i]!=EMPTY_k) SA[j--]=SA[i]; + + // rename each S-type character of the + // interim s1 as the end of its bucket + // to produce the final s1. + succ_t=1; + for(i=n1-1; i>0; i--) { + int_t ch=s1[i], ch1=s1[i-1]; + cur_t=(ch1< ch || (ch1==ch && succ_t==1))?1:0; + if(cur_t==1) { + s1[i-1]+=SA[s1[i-1]]-1; + } + succ_t=cur_t; + } + + return name_ctr; +} + +/*****************************************************************************/ + +uint_t nameSubstr_generalized_LCP(uint_t *SA, int_t *LCP, + uint_t *s, uint_t *s1, uint_t n, + uint_t m, uint_t n1, int level, int cs, uint_t separator) { + uint_t i, j, cur_t, succ_t; + + // init the name array buffer + for(i=n1; i=n1; i--) + if(SA[i]!=EMPTY_k) SA[j--]=SA[i]; + + // rename each S-type character of the + // interim s1 as the end of its bucket + // to produce the final s1. + succ_t=1; + for(i=n1-1; i>0; i--) { + int_t ch=s1[i], ch1=s1[i-1]; + cur_t=(ch1< ch || (ch1==ch && succ_t==1))?1:0; + if(cur_t==1) { + s1[i-1]+=SA[s1[i-1]]-1; + } + succ_t=cur_t; + } + + return name_ctr; +} + +/*****************************************************************************/ + +void getSAlms(uint_t *SA, + int_t *s, + uint_t *s1, uint_t n, + uint_t n1, int level, int cs) { + uint_t i, j, cur_t, succ_t; + + j=n1-1; s1[j--]=n-1; + succ_t=0; // s[n-2] must be L-type + for(i=n-2; i>0; i--) { + cur_t=(chr(i-1)0; i--) if(chr(i)==separator)k++; + DA[n1-1]=(int_da) k; +/**/ + + j=n1-1; s1[j--]=n-1; + succ_t=0; // s[n-2] must be L-type + for(i=n-2; i>0; i--) { + + if(chr(i)==separator)k--; + + cur_t=(chr(i-1)0) || (level&&((int_t *)SA)[i]>0)) + SA[n1++]=SA[i]; + + uint_t *SA1=SA, *s1=SA+m-n1; + uint_t name_ctr; + name_ctr=nameSubstr(SA,s,s1,n,m,n1,level,cs); + + #if PHASES + if(!level){ + printf("phase 1:\n"); + time_stop(t_start_phase, c_start_phase); + } + #endif + + #if PHASES + if(!level){ + t_start_phase = time(NULL); + c_start_phase = clock(); + } + #endif + + // stage 2: solve the reduced problem. + int_t depth=1; + // recurse if names are not yet unique. + if(name_ctr0; i--) + if(chr(i)==separator) + SA[bkt[chr(i)]--]=i; + + // now, all the LMS-substrings are sorted and + // stored sparsely in SA. + + // compact all the sorted substrings into + // the first n1 items of SA. + // 2*n1 must be not larger than n. + uint_t n1=0; + for(i=0; i0)) + SA[n1++]=SA[i]; + + uint_t *SA1=SA, *s1=SA+m-n1; + uint_t name_ctr; + name_ctr=nameSubstr_generalized(SA,s,s1,n,m,n1,level,cs,separator); + + #if PHASES + printf("phase 1:\n"); + time_stop(t_start_phase, c_start_phase); + #endif + + #if PHASES + t_start_phase = time(NULL); + c_start_phase = clock(); + #endif + + // stage 2: solve the reduced problem. + int_t depth=1; + // recurse if names are not yet unique. + if(name_ctr0; i--) + if(chr(i)==separator) + SA[bkt[chr(i)]--]=i; + + #if DEBUG + printf("S-type (separators)\n"); + for(i=0; i0)) + SA[n1++]=SA[i]; + + uint_t *SA1=SA, *s1=SA+m-n1; + uint_t name_ctr; + + #if DEBUG + printf("\nSA\n"); + for(i=0; i0; i--) + if(chr(i)==separator) + SA[bkt[chr(i)]--]=i; + + #if DEBUG + printf("S-type (separators)\n"); + for(i=0; i0)) + SA[n1++]=SA[i]; + + uint_t *SA1=SA, *s1=SA+m-n1; + uint_t name_ctr; + + #if DEBUG + printf("\nSA\n"); + for(i=0; i0; i--) + if(chr(i)==separator) + SA[bkt[chr(i)]--]=i; + + #if DEBUG + printf("S-type (separators)\n"); + for(i=0; i0)) + SA[n1++]=SA[i]; + + uint_t *SA1=SA, *s1=SA+m-n1; + uint_t name_ctr; + + #if DEBUG + printf("\nSA\n"); + for(i=0; i +#include +#include +#include +#include +#include + +#define max(a,b) ((a) > (b) ? (a) : (b)) + +#ifndef DEBUG + #define DEBUG 0 +#endif + +#ifndef M64 + #define M64 1 +#endif + +#if M64 + typedef int64_t int_t; + typedef uint64_t uint_t; + #define PRIdN PRId64 + #define U_MAX UINT64_MAX + #define I_MAX INT64_MAX + #define I_MIN INT64_MIN +#else + typedef int32_t int_t; + typedef uint32_t uint_t; + #define PRIdN PRId32 + #define U_MAX UINT32_MAX + #define I_MAX INT32_MAX + #define I_MIN INT32_MIN +#endif + +/*! @option type of s[0,n-1] for integer alphabets + * + * @constraint sizeof(int_t) >= sizeof(int_text) + */ +typedef uint32_t int_text; //4N bytes for s[0..n-1] +#define PRIdT PRIu32 + +/*! @option type for array DA + */ +typedef int32_t int_da; +#define PRIdA PRId32 + +/******************************************************************************/ + +/** @brief computes the suffix array of string s[0..n-1] + * + * @param s input string with s[n-1]=0 + * @param SA suffix array + * @param n string length + * @return -1 if an error occured, otherwise the depth of the recursive calls. + */ +int sacak(unsigned char *s, uint_t *SA, uint_t n); + +/** @brief computes the suffix array of string s[0..n-1] + * + * @param k alphabet size+1 (0 is reserved) + */ +int sacak_int(int_text *s, uint_t *SA, uint_t n, uint_t k); + +/******************************************************************************/ + +/** @brief Computes the suffix array SA (LCP, DA) of T^cat in s[0..n-1] + * + * @param s input concatenated string, using separators s[i]=1 and with s[n-1]=0 + * @param SA Suffix array + * @param LCP LCP array + * @param DA Document array + * @param n String length + * + * @return depth of the recursive calls. + */ +int gsacak(unsigned char *s, uint_t *SA, int_t *LCP, int_da *DA, uint_t n); + +/** @brief Computes the suffix array SA (LCP, DA) of T^cat in s[0..n-1] + * + * @param s input concatenated string, using separators s[i]=1 and with s[n-1]=0 + * @param SA Suffix array + * @param LCP LCP array + * @param DA Document array + * @param n String length + * @param k alphabet size+2 (0 and 1 are reserved) + * + * @return depth of the recursive calls. + */ +int gsacak_int(int_text *s, uint_t *SA, int_t *LCP, int_da *DA, uint_t n, uint_t k); + +/******************************************************************************/ + + +#define m64 1 + +#if m64 + typedef int64_t int_t; + typedef uint64_t uint_t; + #define PRIdN PRId64 +#else + typedef int32_t int_t; + typedef uint32_t uint_t; + #define PRIdN PRId32 +#endif + +typedef uint32_t int_text; + + + +int_t SACA_K(int_t *s, uint_t *SA, + uint_t n, unsigned int K, + uint_t m, int cs, int level); + +int_t gSACA_K(uint_t *s, uint_t *SA, + uint_t n, unsigned int K, + int cs, uint_t separator, int level); + +#endif