forked from TheAlgorithms/Java
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathAhoCorasick.java
More file actions
249 lines (209 loc) · 10.9 KB
/
AhoCorasick.java
File metadata and controls
249 lines (209 loc) · 10.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
/*
* Aho-Corasick String Matching Algorithm Implementation
*
* This code implements the Aho-Corasick algorithm, which is used for efficient
* string matching in a given text. It can find multiple patterns simultaneously
* and records their positions in the text.
*
* Author: Prabhat-Kumar-42
* GitHub: https://github.com/Prabhat-Kumar-42
*/
package com.thealgorithms.strings;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
public final class AhoCorasick {
private AhoCorasick() {
}
// Trie Node Class
private static class Node {
// Represents a character in the trie
private HashMap<Character, Node> child = new HashMap<>(); // Child nodes of the current node
private Node suffixLink; // Suffix link to another node in the trie
private Node outputLink; // Output link to another node in the trie
private int patternInd; // Index of the pattern that ends at this node
Node() {
this.suffixLink = null;
this.outputLink = null;
this.patternInd = -1;
}
public HashMap<Character, Node> getChild() {
return child;
}
public Node getSuffixLink() {
return suffixLink;
}
public void setSuffixLink(final Node suffixLink) {
this.suffixLink = suffixLink;
}
public Node getOutputLink() {
return outputLink;
}
public void setOutputLink(final Node outputLink) {
this.outputLink = outputLink;
}
public int getPatternInd() {
return patternInd;
}
public void setPatternInd(final int patternInd) {
this.patternInd = patternInd;
}
}
// Trie Class
public static class Trie {
private Node root = null; // Root node of the trie
private final String[] patterns; // patterns according to which Trie is constructed
public Trie(final String[] patterns) {
root = new Node(); // Initialize the root of the trie
this.patterns = patterns;
buildTrie();
buildSuffixAndOutputLinks();
}
// builds AhoCorasick Trie
private void buildTrie() {
// Loop through each input pattern and building Trie
for (int i = 0; i < patterns.length; i++) {
Node curr = root; // Start at the root of the trie for each pattern
// Loop through each character in the current pattern
for (int j = 0; j < patterns[i].length(); j++) {
char c = patterns[i].charAt(j); // Get the current character
// Check if the current node has a child for the current character
if (curr.getChild().containsKey(c)) {
curr = curr.getChild().get(c); // Update the current node to the child node
} else {
// If no child node exists, create a new one and add it to the current node's children
Node nn = new Node();
curr.getChild().put(c, nn);
curr = nn; // Update the current node to the new child node
}
}
curr.setPatternInd(i); // Store the index of the pattern in the current leaf node
}
}
private void initializeSuffixLinksForChildNodesOfTheRoot(Queue<Node> q) {
for (char rc : root.getChild().keySet()) {
Node childNode = root.getChild().get(rc);
q.add(childNode); // Add child node to the queue
childNode.setSuffixLink(root); // Set suffix link to the root
}
}
private void buildSuffixAndOutputLinks() {
root.setSuffixLink(root); // Initialize the suffix link of the root to itself
Queue<Node> q = new LinkedList<>(); // Initialize a queue for BFS traversal
initializeSuffixLinksForChildNodesOfTheRoot(q);
while (!q.isEmpty()) {
Node currentState = q.poll(); // Get the current node for processing
// Iterate through child nodes of the current node
for (char cc : currentState.getChild().keySet()) {
Node currentChild = currentState.getChild().get(cc); // Get the child node
Node parentSuffix = currentState.getSuffixLink(); // Get the parent's suffix link
// Calculate the suffix link for the child based on the parent's suffix link
while (!parentSuffix.getChild().containsKey(cc) && parentSuffix != root) {
parentSuffix = parentSuffix.getSuffixLink();
}
// Set the calculated suffix link or default to root
if (parentSuffix.getChild().containsKey(cc)) {
currentChild.setSuffixLink(parentSuffix.getChild().get(cc));
} else {
currentChild.setSuffixLink(root);
}
q.add(currentChild); // Add the child node to the queue for further processing
}
// Establish output links for nodes to efficiently identify patterns within patterns
if (currentState.getSuffixLink().getPatternInd() >= 0) {
currentState.setOutputLink(currentState.getSuffixLink());
} else {
currentState.setOutputLink(currentState.getSuffixLink().getOutputLink());
}
}
}
private ArrayList<ArrayList<Integer>> initializePositionByStringIndexValue() {
ArrayList<ArrayList<Integer>> positionByStringIndexValue = new ArrayList<>(patterns.length); // Stores positions where patterns are found in the text
for (int i = 0; i < patterns.length; i++) {
positionByStringIndexValue.add(new ArrayList<Integer>());
}
return positionByStringIndexValue;
}
// Searches for patterns in the input text and records their positions
public ArrayList<ArrayList<Integer>> searchIn(final String text) {
var positionByStringIndexValue = initializePositionByStringIndexValue(); // Initialize a list to store positions of the current pattern
Node parent = root; // Start searching from the root node
PatternPositionRecorder positionRecorder = new PatternPositionRecorder(positionByStringIndexValue);
for (int i = 0; i < text.length(); i++) {
char ch = text.charAt(i); // Get the current character in the text
// Check if the current node has a child for the current character
if (parent.getChild().containsKey(ch)) {
parent = parent.getChild().get(ch); // Update the current node to the child node
positionRecorder.recordPatternPositions(parent, i); // Use the method in PatternPositionRecorder to record positions
} else {
// If no child node exists for the character, backtrack using suffix links
while (parent != root && !parent.getChild().containsKey(ch)) {
parent = parent.getSuffixLink();
}
if (parent.getChild().containsKey(ch)) {
i--; // Decrement i to reprocess the same character
}
}
}
setUpStartPoints(positionByStringIndexValue);
return positionByStringIndexValue;
}
// by default positionByStringIndexValue contains end-points. This function converts those
// endpoints to start points
private void setUpStartPoints(ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
for (int i = 0; i < patterns.length; i++) {
for (int j = 0; j < positionByStringIndexValue.get(i).size(); j++) {
int endpoint = positionByStringIndexValue.get(i).get(j);
positionByStringIndexValue.get(i).set(j, endpoint - patterns[i].length() + 1);
}
}
}
}
// Class to handle pattern position recording
private static class PatternPositionRecorder {
private ArrayList<ArrayList<Integer>> positionByStringIndexValue;
// Constructor to initialize the recorder with the position list
PatternPositionRecorder(final ArrayList<ArrayList<Integer>> positionByStringIndexValue) {
this.positionByStringIndexValue = positionByStringIndexValue;
}
/**
* Records positions for a pattern when it's found in the input text and follows
* output links to record positions of other patterns.
*
* @param parent The current node representing a character in the pattern trie.
* @param currentPosition The current position in the input text.
*/
public void recordPatternPositions(final Node parent, final int currentPosition) {
// Check if the current node represents the end of a pattern
if (parent.getPatternInd() > -1) {
// Add the current position to the list of positions for the found pattern
positionByStringIndexValue.get(parent.getPatternInd()).add(currentPosition);
}
Node outputLink = parent.getOutputLink();
// Follow output links to find and record positions of other patterns
while (outputLink != null) {
// Add the current position to the list of positions for the pattern linked by outputLink
positionByStringIndexValue.get(outputLink.getPatternInd()).add(currentPosition);
outputLink = outputLink.getOutputLink();
}
}
}
// method to search for patterns in text
public static Map<String, ArrayList<Integer>> search(final String text, final String[] patterns) {
final var trie = new Trie(patterns);
final var positionByStringIndexValue = trie.searchIn(text);
return convert(positionByStringIndexValue, patterns);
}
// method for converting results to a map
private static Map<String, ArrayList<Integer>> convert(final ArrayList<ArrayList<Integer>> positionByStringIndexValue, final String[] patterns) {
Map<String, ArrayList<Integer>> positionByString = new HashMap<>();
for (int i = 0; i < patterns.length; i++) {
String pattern = patterns[i];
ArrayList<Integer> positions = positionByStringIndexValue.get(i);
positionByString.put(pattern, new ArrayList<>(positions));
}
return positionByString;
}
}