-
-
Notifications
You must be signed in to change notification settings - Fork 50.5k
Expand file tree
/
Copy pathrabin_karp.py
More file actions
243 lines (196 loc) · 7.61 KB
/
rabin_karp.py
File metadata and controls
243 lines (196 loc) · 7.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""
Rabin-Karp String Matching Algorithm
The Rabin-Karp algorithm uses hashing to find patterns in text.
It employs a rolling hash technique for efficient pattern searching.
Time Complexity:
- Average case: O(n + m) where n is text length, m is pattern length
- Worst case: O(nm) when many spurious hits occur
Space Complexity: O(1) for single pattern, O(k) for k patterns
Applications:
- Plagiarism detection
- DNA sequence matching
- Multiple pattern searching
- Finding duplicate content
"""
def rabin_karp_search(
text: str, pattern: str, base: int = 256, modulus: int = 101
) -> list[int]:
"""
Search for a pattern in text using Rabin-Karp algorithm.
Args:
text: The text to search in
pattern: The pattern to search for
base: The base for hash calculation (default: 256 for ASCII)
modulus: The modulus for hash calculation (prime number)
Returns:
List of starting indices where pattern is found
Examples:
>>> rabin_karp_search("hello world hello", "hello")
[0, 12]
>>> rabin_karp_search("aaaa", "aa")
[0, 1, 2]
>>> rabin_karp_search("abc", "xyz")
[]
>>> rabin_karp_search("", "a")
[]
>>> rabin_karp_search("a", "")
[]
>>> rabin_karp_search("abcdefg", "cde")
[2]
>>> rabin_karp_search("ABABDABACDABABCABAB", "ABABCABAB")
[10]
>>> rabin_karp_search("test test test", "test")
[0, 5, 10]
"""
if not pattern or not text or len(pattern) > len(text):
return []
n = len(text)
m = len(pattern)
matches = []
# Calculate hash value for pattern and first window of text
pattern_hash = 0
text_hash = 0
h = 1
# The value of h would be "pow(base, m-1) % modulus"
for _ in range(m - 1):
h = (h * base) % modulus
# Calculate initial hash values
for i in range(m):
pattern_hash = (base * pattern_hash + ord(pattern[i])) % modulus
text_hash = (base * text_hash + ord(text[i])) % modulus
# Slide the pattern over text one by one
for i in range(n - m + 1):
# Check if hash values match and verify to avoid spurious hits
if pattern_hash == text_hash and text[i : i + m] == pattern:
matches.append(i)
# Calculate hash for next window (rolling hash)
if i < n - m:
# Remove leading character and add trailing character
text_hash = (
base * (text_hash - ord(text[i]) * h) + ord(text[i + m])
) % modulus
# Handle negative hash values
if text_hash < 0:
text_hash += modulus
return matches
def rabin_karp_multiple(
text: str, patterns: list[str], base: int = 256, modulus: int = 101
) -> dict[str, list[int]]:
"""
Search for multiple patterns in text using Rabin-Karp algorithm.
This is more efficient than running single pattern search multiple times
because we only scan the text once.
Args:
text: The text to search in
patterns: List of patterns to search for
base: The base for hash calculation
modulus: The modulus for hash calculation
Returns:
Dictionary mapping each pattern to list of indices where found
Examples:
>>> result = rabin_karp_multiple("hello world hello", ["hello", "world"])
>>> result == {"hello": [0, 12], "world": [6]}
True
>>> result = rabin_karp_multiple("aaaa", ["aa", "aaa"])
>>> result == {"aa": [0, 1, 2], "aaa": [0, 1]}
True
>>> result = rabin_karp_multiple("test", ["abc", "xyz"])
>>> result == {"abc": [], "xyz": []}
True
>>> result = rabin_karp_multiple("", ["a", "b"])
>>> result == {"a": [], "b": []}
True
>>> result = rabin_karp_multiple("abcdef", ["ab", "cd", "ef"])
>>> result == {"ab": [0], "cd": [2], "ef": [4]}
True
"""
if not text or not patterns:
return {pattern: [] for pattern in patterns}
# Group patterns by length for efficient processing
patterns_by_length: dict[int, list[str]] = {}
for pattern in patterns:
if pattern: # Skip empty patterns
length = len(pattern)
if length not in patterns_by_length:
patterns_by_length[length] = []
patterns_by_length[length].append(pattern)
results: dict[str, list[int]] = {pattern: [] for pattern in patterns}
# Process each group of patterns with same length
for pattern_length, pattern_group in patterns_by_length.items():
if pattern_length > len(text):
continue
# Calculate pattern hashes
pattern_hashes = {}
for pattern in pattern_group:
pattern_hash = 0
for char in pattern:
pattern_hash = (base * pattern_hash + ord(char)) % modulus
pattern_hashes[pattern] = pattern_hash
# Calculate hash for first window
text_hash = 0
h = 1
for _ in range(pattern_length - 1):
h = (h * base) % modulus
for i in range(pattern_length):
text_hash = (base * text_hash + ord(text[i])) % modulus
# Slide the window over text
for i in range(len(text) - pattern_length + 1):
# Check if current hash matches any pattern hash
for pattern, pattern_hash in pattern_hashes.items():
# Verify to avoid spurious hits
if (
text_hash == pattern_hash
and text[i : i + pattern_length] == pattern
):
results[pattern].append(i)
# Calculate hash for next window
if i < len(text) - pattern_length:
text_hash = (
base * (text_hash - ord(text[i]) * h)
+ ord(text[i + pattern_length])
) % modulus
if text_hash < 0:
text_hash += modulus
return results
def rabin_karp_search_optimized(
text: str, pattern: str, base: int = 256, modulus: int = 1_000_000_007
) -> list[int]:
"""
Optimized version with larger modulus to reduce collisions.
Using a larger prime modulus (10^9 + 7) significantly reduces
the probability of hash collisions, improving average-case performance.
Args:
text: The text to search in
pattern: The pattern to search for
base: The base for hash calculation
modulus: Large prime modulus (default: 10^9 + 7)
Returns:
List of starting indices where pattern is found
Examples:
>>> rabin_karp_search_optimized("hello world", "world")
[6]
>>> rabin_karp_search_optimized("aaabaaaa", "aaaa")
[4]
>>> rabin_karp_search_optimized("abc", "d")
[]
"""
return rabin_karp_search(text, pattern, base, modulus)
if __name__ == "__main__":
import doctest
doctest.testmod()
# Performance demonstration
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit" * 100
pattern = "consectetur"
print("Rabin-Karp String Matching Algorithm Demo")
print("=" * 50)
# Single pattern search
matches = rabin_karp_search(text, pattern)
print(f"\nSearching for '{pattern}' in text ({len(text)} chars)")
print(f"Found {len(matches)} matches at indices: {matches[:5]}...")
# Multiple pattern search
patterns = ["Lorem", "ipsum", "consectetur", "adipiscing"]
results = rabin_karp_multiple(text, patterns)
print(f"\nSearching for {len(patterns)} patterns:")
for p, indices in results.items():
print(f" '{p}': {len(indices)} matches")
print("\n✓ All tests passed!")