匹配屏蔽词:Aho-Corasick 算法
前言
Aho–Corasick 算法(AC 算法)是由 Alfred V. Aho 和 Margaret J.Corasick 发明的多模式字符串搜索算法,用于在输入的一串字符串中匹配字典中的子串。结合 KMP 算法的思想,在字典树的基础上,给每个节点加入失败指针,指向和已匹配字符串的后缀相等的,字典中的最长前缀,失配时保留已匹配的内容,极大提升了匹配的效率。
目前类似于博客、评论、聊天等文本敏感词匹配方案,传统方法中多使用 AC 算法,一般是从已有的敏感词文本中构建字典树,然后在对输入文本进行匹配。
构建字典树
构建字典树,如果需要的关键词是 she、her、he、his,很显然,有如下字典树:
graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))
代码示例:
ts- type trieNodeType = {
- children: null | Map<string, trieNodeType>
- key: string
- pre: null | trieNodeType
- fail: null | trieNodeType
- end: boolean
- }
- const trieNodeFactory = (key: string, pre: null | trieNodeType = null, end: boolean = false): trieNodeType => {
- return {
- children: null, fail: null, key, pre, end
- }
- }
- class ACPatternMatchingMachine {
- private root = trieNodeFactory('')
- constructor (words: string[]) {
- words.forEach(word => this._addWord(word))
- this.generateFailRoute()
- }
- private _addWord (word: string) {
- let pre = this.root, len = word.length
- for (let i = 0; i < len; i++) {
- if (!pre.children) {
- pre.children = new Map()
- }
- let record = pre.children.get(word[i])
- if (!record) {
- record = trieNodeFactory(word[i], pre, i === len - 1)
- pre.children.set(word[i], record)
- } else {
- record.end = i === len - 1
- }
- pre = record
- }
- }
- private generateFailRoute () {}
- match (text: string) {}
- }
失败指针
下面开始构建失败指针,从根节点开始层序遍历
- 根节点及其子节点指向根节点,见下图虚线:
graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))
A -.-> A
B -.-> A
E -.-> A
- 若亲节点的失败路径接受自己,则指向对应的节点。这里是意思是沿亲节点的失败路径遍历,存在节点的子节点与当前节点文本相同,找到则指向该子节点,见下图虚线。
graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))
C -.-> E
D -.-> F
- 若亲节点的失败路径不接受自己,则指向根节点,见下图虚线。
graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))
G -.-> A
F -.-> A
H -.-> A
I -.-> A
代码示例:
ts- class ACPatternMatchingMachine {
- // ...
- private generateFailRoute () {
- const queue: trieNodeType[] = [this.root]
- while (queue.length) {
- const cur = queue.shift()!
- this.generateFailRouteImpl(cur)
- cur.children && queue.push(...cur.children.values())
- }
-
- }
- private generateFailRouteImpl (record: trieNodeType) {
- if (record === this.root) {
- record.fail = record
- } else if (record.pre === this.root) {
- record.fail = this.root
- } else {
- const failRoute = record.pre!.fail!
- let cur: trieNodeType | null = failRoute
- while (cur) {
- const tmp = cur.children?.get(record.key)
- if (tmp) {
- record.fail = tmp
- break
- }
- if (cur === this.root) {
- break
- }
- cur = cur.fail
- }
- record.fail ??= this.root
- }
- }
- }
文本匹配
遍历文本,从字典树根节点开始,如果匹配到则往下走,匹配不到时,如果失败指针指向非根节点,则在失败指针处继续匹配当前字符,指向根节点则在根节点匹配下一个字符:
ts- class ACPatternMatchingMachine {
- // ...
- private wordCache = new Map<trieNodeType, string>()
-
- private getWord (record: trieNodeType): string {
- const cache = this.wordCache.get(record)
- if (cache) {
- return cache
- }
- let ans = '', cur: trieNodeType | null = record
- while (cur) {
- ans = cur.key + ans
- cur = cur.pre
- }
- this.wordCache.set(record, ans)
- return ans
- }
- match (text: string) {
- const ans: {
- word: string,
- start: number
- end: number
- }[] = []
- const len = text.length
- let word = ''
- let cur = this.root
- for (let i = 0; i < len; i++) {
- const record = cur.children?.get(text[i])
- if (record) {
- cur = record
- word += record.key
- if (record.end) {
- ans.push({
- word,
- end: i,
- start: i - word.length + 1
- })
- }
- } else {
- cur = cur.fail!
- word = this.getWord(cur)
- cur.fail !== this.root && i--
- }
- }
- return ans
- }
- }
最终实现
ts- type trieNodeType = {
- children: null | Map<string, trieNodeType>
- key: string
- pre: null | trieNodeType
- fail: null | trieNodeType
- end: boolean
- }
- const trieNodeFactory = (key: string, pre: null | trieNodeType = null, end: boolean = false): trieNodeType => {
- return {
- children: null,
- fail: null,
- key,
- pre,
- end
- }
- }
- class ACPatternMatchingMachine {
- private root = trieNodeFactory('')
- private wordCache = new Map<trieNodeType, string>()
- constructor (words: string[]) {
- words.forEach(word => this._addWord(word))
- this.generateFailRoute()
- }
- private _addWord (word: string, addNodeCallback?: Function) {
- let pre = this.root, len = word.length
- for (let i = 0; i < len; i++) {
- if (!pre.children) {
- pre.children = new Map()
- }
- let record = pre.children.get(word[i])
- if (!record) {
- record = trieNodeFactory(word[i], pre, i === len - 1)
- pre.children.set(word[i], record)
- typeof addNodeCallback === 'function' && addNodeCallback(record)
- } else {
- record.end = i === len - 1
- }
- pre = record
- }
- }
- addWord (word: string) {
- this._addWord(word, this.generateFailRouteImpl.bind(this))
- }
- private getWord (record: trieNodeType): string {
- const cache = this.wordCache.get(record)
- if (cache) {
- return cache
- }
- let ans = '', cur: trieNodeType | null = record
- while (cur) {
- ans = cur.key + ans
- cur = cur.pre
- }
- this.wordCache.set(record, ans)
- return ans
- }
- private generateFailRoute () {
- const queue: trieNodeType[] = [this.root]
- while (queue.length) {
- const cur = queue.shift()!
- this.generateFailRouteImpl(cur)
- cur.children && queue.push(...cur.children.values())
- }
-
- }
- private generateFailRouteImpl (record: trieNodeType) {
- if (record === this.root) {
- record.fail = record
- } else if (record.pre === this.root) {
- record.fail = this.root
- } else {
- const failRoute = record.pre!.fail!
- let cur: trieNodeType | null = failRoute
- while (cur) {
- const tmp = cur.children?.get(record.key)
- if (tmp) {
- record.fail = tmp
- break
- }
- if (cur === this.root) {
- break
- }
- cur = cur.fail
- }
- record.fail ??= this.root
- }
- }
- match (text: string) {
- const ans: {
- word: string,
- start: number
- end: number
- }[] = []
- const len = text.length
- let word = ''
- let cur = this.root
- for (let i = 0; i < len; i++) {
- const record = cur.children?.get(text[i])
- if (record) {
- cur = record
- word += record.key
- if (record.end) {
- ans.push({
- word,
- end: i,
- start: i - word.length + 1
- })
- }
- } else {
- cur = cur.fail!
- word = this.getWord(cur)
- cur.fail !== this.root && i--
- }
- }
- return ans
- }
- }
总结
本文简述了在匹配屏蔽词中多用的 Aho-Corasick 算法的实现,通过失败指针,可以大限度利用已匹配信息,提高匹配效率。
0 条评论未登录用户
Ctrl or + Enter 评论
