Skip to content

Commit 9e695c2

Browse files
committed
feat: add ahocorasick filter
1 parent 98f0955 commit 9e695c2

File tree

13 files changed

+694
-122
lines changed

13 files changed

+694
-122
lines changed

README-zh_cn.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
- 支持运行过程中动态修改数据源
2424
- 支持多种过滤算法
2525
- **DFA** 使用 `trie tree` 数据结构匹配敏感词
26+
- **AC 自动机**
2627

2728
## ⚙ Usage
2829

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ English | [中文](README-zh_cn.md)
1414
- `IsSensitive()` check whether the text contains a sensitive word
1515
- `FindOne()` return first sensitive word that has been found in the text
1616
- `FindAll()` return all sensitive word that has been found in the text
17-
- `FindAllCount()` return all sensitive word with its count that has been found in the text
17+
- `FindAllCount()` return all sensitive word with its count that has been found in the text
1818
- support multiple data sources with dynamic modification
1919
- support memory storage
2020
- support mysql storage
@@ -23,7 +23,7 @@ English | [中文](README-zh_cn.md)
2323
- support dynamic add/del sensitive word while running
2424
- support multiple filter algorithms
2525
- **DFA** use `trie tree` to filter sensitive words
26-
26+
- **Aho–Corasick algorithm**
2727
## ⚙ Usage
2828

2929
```go

filter/ahocorasick.go

Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,338 @@
1+
package filter
2+
3+
import (
4+
"github.com/sgoware/ds/queue/arrayqueue"
5+
)
6+
7+
// acNode is a single trie node of the Aho–Corasick automaton.
type acNode struct {
	// value is the rune on the edge leading from the parent to this node
	// (0 for the root created by NewAcModel).
	value rune
	// children maps the next rune to the corresponding child node.
	children map[rune]*acNode
	// word is non-nil when a stored word ends at this node; it points to a
	// copy of that word.
	word *string
	// fail is the node to fall back to when no child matches the next rune.
	// It is assigned by buildFailPointers and is nil until then.
	fail *acNode
}
13+
14+
func newAcNode(r rune) *acNode {
15+
return &acNode{
16+
value: r,
17+
children: make(map[rune]*acNode),
18+
word: nil,
19+
}
20+
}
21+
22+
type AcModel struct {
23+
root *acNode
24+
}
25+
26+
func NewAcModel() *AcModel {
27+
return &AcModel{
28+
root: newAcNode(0),
29+
}
30+
}
31+
32+
func (m *AcModel) AddWords(words ...string) {
33+
for _, word := range words {
34+
m.AddWord(word)
35+
}
36+
37+
m.buildFailPointers()
38+
}
39+
40+
func (m *AcModel) AddWord(word string) {
41+
now := m.root
42+
runes := []rune(word)
43+
44+
for _, r := range runes {
45+
if next, ok := now.children[r]; ok {
46+
now = next
47+
} else {
48+
next = newAcNode(r)
49+
now.children[r] = next
50+
now = next
51+
}
52+
}
53+
54+
now.word = new(string)
55+
*now.word = word
56+
}
57+
58+
func (m *AcModel) DelWords(words ...string) {
59+
for _, word := range words {
60+
m.DelWord(word)
61+
}
62+
63+
m.buildFailPointers()
64+
}
65+
66+
func (m *AcModel) DelWord(word string) {
67+
var lastLeaf *acNode
68+
var lastLeafNextRune rune
69+
now := m.root
70+
runes := []rune(word)
71+
72+
for _, r := range runes {
73+
if next, ok := now.children[r]; !ok {
74+
return
75+
} else {
76+
if now.word != nil {
77+
lastLeaf = now
78+
lastLeafNextRune = r
79+
}
80+
now = next
81+
}
82+
}
83+
84+
delete(lastLeaf.children, lastLeafNextRune)
85+
}
86+
87+
func (m *AcModel) buildFailPointers() {
88+
q := arrayqueue.New(m.root)
89+
90+
for q.Len() > 0 {
91+
temp, _ := q.Top()
92+
q.Pop()
93+
for _, node := range temp.(*acNode).children {
94+
if temp.(*acNode) == m.root {
95+
node.fail = m.root
96+
} else {
97+
p := temp.(*acNode).fail
98+
for p != nil {
99+
if next, found := p.children[node.value]; found {
100+
node.fail = next
101+
break
102+
}
103+
p = p.fail
104+
}
105+
if p == nil {
106+
node.fail = m.root
107+
}
108+
}
109+
110+
q.Push(node)
111+
}
112+
}
113+
}
114+
115+
func (m *AcModel) Listen(addChan, delChan <-chan string) {
116+
go func() {
117+
var words []string
118+
119+
for word := range addChan {
120+
words = append(words, word)
121+
if len(addChan) == 0 {
122+
m.AddWords(words...)
123+
word = word[:0]
124+
}
125+
}
126+
}()
127+
128+
go func() {
129+
var words []string
130+
131+
for word := range delChan {
132+
words = append(words, word)
133+
if len(delChan) == 0 {
134+
m.DelWords(words...)
135+
word = word[:0]
136+
}
137+
}
138+
}()
139+
}
140+
141+
func (m *AcModel) FindAll(text string) []string {
142+
var matches []string
143+
var found bool
144+
145+
now := m.root
146+
var temp *acNode
147+
runes := []rune(text)
148+
149+
for pos := 0; pos < len(runes); pos++ {
150+
_, found = now.children[runes[pos]]
151+
if !found && now != m.root {
152+
now = now.fail
153+
for ; !found && now != m.root; now, found = now.children[runes[pos]] {
154+
now = now.fail
155+
}
156+
}
157+
158+
// 若找到匹配成功的字符串结点, 则指向那个结点, 否则指向根结点
159+
if next, ok := now.children[runes[pos]]; ok {
160+
now = next
161+
} else {
162+
now = m.root
163+
}
164+
165+
temp = now
166+
167+
for temp != m.root {
168+
if temp.word != nil {
169+
matches = append(matches, *temp.word)
170+
}
171+
temp = temp.fail
172+
}
173+
}
174+
175+
var res []string
176+
set := make(map[string]struct{})
177+
178+
for _, word := range matches {
179+
if _, ok := set[word]; !ok {
180+
set[word] = struct{}{}
181+
res = append(res, word)
182+
}
183+
}
184+
185+
return res
186+
}
187+
188+
func (m *AcModel) FindAllCount(text string) map[string]int {
189+
res := make(map[string]int)
190+
var found bool
191+
var temp *acNode
192+
193+
now := m.root
194+
runes := []rune(text)
195+
196+
for pos := 0; pos < len(runes); pos++ {
197+
_, found = now.children[runes[pos]]
198+
if !found && now != m.root {
199+
now = now.fail
200+
for ; !found && now != m.root; now, found = now.children[runes[pos]] {
201+
now = now.fail
202+
}
203+
}
204+
205+
// 若找到匹配成功的字符串结点, 则指向那个结点, 否则指向根结点
206+
if next, ok := now.children[runes[pos]]; ok {
207+
now = next
208+
} else {
209+
now = m.root
210+
}
211+
212+
temp = now
213+
214+
for temp != m.root {
215+
if temp.word != nil {
216+
res[*temp.word]++
217+
}
218+
temp = temp.fail
219+
}
220+
}
221+
222+
return res
223+
}
224+
225+
func (m *AcModel) FindOne(text string) string {
226+
var found bool
227+
var temp *acNode
228+
229+
now := m.root
230+
runes := []rune(text)
231+
232+
for pos := 0; pos < len(runes); pos++ {
233+
_, found = now.children[runes[pos]]
234+
if !found && now != m.root {
235+
now = now.fail
236+
for ; !found && now != m.root; now, found = now.children[runes[pos]] {
237+
now = now.fail
238+
}
239+
}
240+
241+
// 若找到匹配成功的字符串结点, 则指向那个结点, 否则指向根结点
242+
if next, ok := now.children[runes[pos]]; ok {
243+
now = next
244+
} else {
245+
now = m.root
246+
}
247+
248+
temp = now
249+
250+
for temp != m.root {
251+
if temp.word != nil {
252+
return *temp.word
253+
}
254+
temp = temp.fail
255+
}
256+
}
257+
258+
return ""
259+
}
260+
261+
func (m *AcModel) IsSensitive(text string) bool {
262+
return m.FindOne(text) != ""
263+
}
264+
265+
func (m *AcModel) Replace(text string, repl rune) string {
266+
var found bool
267+
var temp *acNode
268+
269+
now := m.root
270+
runes := []rune(text)
271+
272+
for pos := 0; pos < len(runes); pos++ {
273+
_, found = now.children[runes[pos]]
274+
if !found && now != m.root {
275+
now = now.fail
276+
for ; !found && now != m.root; now, found = now.children[runes[pos]] {
277+
now = now.fail
278+
}
279+
}
280+
281+
// 若找到匹配成功的字符串结点, 则指向那个结点, 否则指向根结点
282+
if next, ok := now.children[runes[pos]]; ok {
283+
now = next
284+
} else {
285+
now = m.root
286+
}
287+
288+
temp = now
289+
290+
for temp != m.root {
291+
if temp.word != nil {
292+
for i := pos - len([]rune(*temp.word)) + 1; i <= pos; i++ {
293+
runes[i] = repl
294+
}
295+
}
296+
temp = temp.fail
297+
}
298+
}
299+
300+
return string(runes)
301+
}
302+
303+
func (m *AcModel) Remove(text string) string {
304+
var found bool
305+
var temp *acNode
306+
307+
now := m.root
308+
runes := []rune(text)
309+
310+
for pos := 0; pos < len(runes); pos++ {
311+
_, found = now.children[runes[pos]]
312+
if !found && now != m.root {
313+
now = now.fail
314+
for ; !found && now != m.root; now, found = now.children[runes[pos]] {
315+
now = now.fail
316+
}
317+
}
318+
319+
// 若找到匹配成功的字符串结点, 则指向那个结点, 否则指向根结点
320+
if next, ok := now.children[runes[pos]]; ok {
321+
now = next
322+
} else {
323+
now = m.root
324+
}
325+
326+
temp = now
327+
328+
for temp != m.root {
329+
if temp.word != nil {
330+
runes = append(runes[:pos-len([]rune(*temp.word))+1], runes[pos+1:]...)
331+
pos -= len([]rune(*temp.word))
332+
}
333+
temp = temp.fail
334+
}
335+
}
336+
337+
return string(runes)
338+
}

0 commit comments

Comments
 (0)