-
-
Notifications
You must be signed in to change notification settings - Fork 50.5k
Expand file tree
/
Copy pathsuffix_automaton.py
More file actions
129 lines (107 loc) · 3.38 KB
/
suffix_automaton.py
File metadata and controls
129 lines (107 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Suffix Automaton
----------------
A suffix automaton (SAM) is a minimal deterministic finite automaton (DFA)
that recognizes all substrings of a given string.
It can be built in O(n) time and space, where n is the length of the string.
Suffix automatons are useful for:
- counting distinct substrings
- checking substring existence
- finding longest common substrings between strings
Reference:
https://cp-algorithms.com/string/suffix-automaton.html
Example:
>>> sa = build_suffix_automaton("ababa")
>>> is_substring(sa, "aba")
True
>>> is_substring(sa, "abc")
False
>>> count_distinct_substrings(sa)
9
"""
from typing import TypedDict
class State(TypedDict):
length: int
link: int
next: dict[str, int]
def build_suffix_automaton(s: str) -> list[State]:
"""
Build the suffix automaton for the given string.
Each state is represented as a dictionary with:
'length' -> length of the longest substring for this state
'link' -> suffix link (integer)
'next' -> transitions (dict: char -> state index)
>>> sa = build_suffix_automaton("ababa")
>>> isinstance(sa, list)
True
>>> all(isinstance(state, dict) for state in sa)
True
"""
sa: list[State] = [{"length": 0, "link": -1, "next": {}}]
last = 0
for ch in s:
cur = len(sa)
sa.append({"length": sa[last]["length"] + 1, "link": 0, "next": {}})
p = last
while p >= 0 and ch not in sa[p]["next"]:
sa[p]["next"][ch] = cur
p = sa[p]["link"]
if p == -1:
sa[cur]["link"] = 0
else:
q = sa[p]["next"][ch]
if sa[p]["length"] + 1 == sa[q]["length"]:
sa[cur]["link"] = q
else:
clone = len(sa)
sa.append(
{
"length": sa[p]["length"] + 1,
"link": sa[q]["link"],
"next": sa[q]["next"].copy(),
}
)
while p >= 0 and sa[p]["next"].get(ch) == q:
sa[p]["next"][ch] = clone
p = sa[p]["link"]
sa[q]["link"] = sa[cur]["link"] = clone
last = cur
return sa
def is_substring(sa: list[State], substring: str) -> bool:
"""
Checks whether the given substring exists in the automaton.
>>> sa = build_suffix_automaton("ababa")
>>> is_substring(sa, "aba")
True
>>> is_substring(sa, "bab")
True
>>> is_substring(sa, "abc")
False
"""
state = 0
for ch in substring:
if ch not in sa[state]["next"]:
return False
state = sa[state]["next"][ch]
return True
def count_distinct_substrings(sa: list[State]) -> int:
"""
Returns the number of distinct substrings in the original string
represented by the suffix automaton.
The number of distinct substrings is:
sum(len[v] - len[link[v]]) for all states v != 0
>>> sa = build_suffix_automaton("ababa")
>>> count_distinct_substrings(sa)
9
>>> count_distinct_substrings(build_suffix_automaton("aaaa"))
4
>>> count_distinct_substrings(build_suffix_automaton("abc"))
6
"""
total = 0
for v in range(1, len(sa)):
total += sa[v]["length"] - sa[sa[v]["link"]]["length"]
return total
if __name__ == "__main__":
from doctest import testmod
testmod()