coverPiccoverPic

匹配屏蔽词:Aho-Corasick 算法

前言

Aho–Corasick 算法(AC 算法)是由 Alfred V. Aho 和 Margaret J.Corasick 发明的多模式字符串搜索算法,用于在输入的一串字符串中匹配字典中的子串。结合 KMP 算法的思想,在字典树的基础上,给每个节点加入失败指针,指向和已匹配字符串的后缀相等的,字典中的最长前缀,失配时保留已匹配的内容,极大提升了匹配的效率。

目前类似于博客、评论、聊天等文本敏感词匹配方案,传统方法中多使用 AC 算法,一般是从已有的敏感词文本中构建字典树,然后在对输入文本进行匹配。

构建字典树

构建字典树,如果需要的关键词是 she、her、he、his,很显然,有如下字典树:

graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))

代码示例:

ts
  1. type trieNodeType = {
  2. children: null | Map<string, trieNodeType>
  3. key: string
  4. pre: null | trieNodeType
  5. fail: null | trieNodeType
  6. end: boolean
  7. }
  8. const trieNodeFactory = (key: string, pre: null | trieNodeType = null, end: boolean = false): trieNodeType => {
  9. return {
  10. children: null, fail: null, key, pre, end
  11. }
  12. }
  13. class ACPatternMatchingMachine {
  14. private root = trieNodeFactory('')
  15. constructor (words: string[]) {
  16. words.forEach(word => this._addWord(word))
  17. this.generateFailRoute()
  18. }
  19. private _addWord (word: string) {
  20. let pre = this.root, len = word.length
  21. for (let i = 0; i < len; i++) {
  22. if (!pre.children) {
  23. pre.children = new Map()
  24. }
  25. let record = pre.children.get(word[i])
  26. if (!record) {
  27. record = trieNodeFactory(word[i], pre, i === len - 1)
  28. pre.children.set(word[i], record)
  29. } else {
  30. record.end = i === len - 1
  31. }
  32. pre = record
  33. }
  34. }
  35. private generateFailRoute () {}
  36. match (text: string) {}
  37. }

失败指针

下面开始构建失败指针,从根节点开始层序遍历

  1. 根节点及其子节点指向根节点,见下图虚线:
graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))
A -.-> A
B -.-> A
E -.-> A
  1. 若亲节点的失败路径接受自己,则指向对应的节点。这里是意思是沿亲节点的失败路径遍历,存在节点的子节点与当前节点文本相同,找到则指向该子节点,见下图虚线。
graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))
C -.-> E
D -.-> F
  1. 若亲节点的失败路径不接受自己,则指向根节点,见下图虚线。
graph LR
A(('')) --> B((s)) --> C((h)) --> D((e))
A --> E((h)) --> F((e)) --> G((r))
E-->H((i))-->I((s))
G -.-> A
F -.-> A
H -.-> A
I -.-> A

代码示例:

ts
  1. class ACPatternMatchingMachine {
  2. // ...
  3. private generateFailRoute () {
  4. const queue: trieNodeType[] = [this.root]
  5. while (queue.length) {
  6. const cur = queue.shift()!
  7. this.generateFailRouteImpl(cur)
  8. cur.children && queue.push(...cur.children.values())
  9. }
  10. }
  11. private generateFailRouteImpl (record: trieNodeType) {
  12. if (record === this.root) {
  13. record.fail = record
  14. } else if (record.pre === this.root) {
  15. record.fail = this.root
  16. } else {
  17. const failRoute = record.pre!.fail!
  18. let cur: trieNodeType | null = failRoute
  19. while (cur) {
  20. const tmp = cur.children?.get(record.key)
  21. if (tmp) {
  22. record.fail = tmp
  23. break
  24. }
  25. if (cur === this.root) {
  26. break
  27. }
  28. cur = cur.fail
  29. }
  30. record.fail ??= this.root
  31. }
  32. }
  33. }

文本匹配

遍历文本,从字典树根节点开始,如果匹配到则往下走,匹配不到时,如果失败指针指向非根节点,则在失败指针处继续匹配当前字符,指向根节点则在根节点匹配下一个字符:

ts
  1. class ACPatternMatchingMachine {
  2. // ...
  3. private wordCache = new Map<trieNodeType, string>()
  4. private getWord (record: trieNodeType): string {
  5. const cache = this.wordCache.get(record)
  6. if (cache) {
  7. return cache
  8. }
  9. let ans = '', cur: trieNodeType | null = record
  10. while (cur) {
  11. ans = cur.key + ans
  12. cur = cur.pre
  13. }
  14. this.wordCache.set(record, ans)
  15. return ans
  16. }
  17. match (text: string) {
  18. const ans: {
  19. word: string,
  20. start: number
  21. end: number
  22. }[] = []
  23. const len = text.length
  24. let word = ''
  25. let cur = this.root
  26. for (let i = 0; i < len; i++) {
  27. const record = cur.children?.get(text[i])
  28. if (record) {
  29. cur = record
  30. word += record.key
  31. if (record.end) {
  32. ans.push({
  33. word,
  34. end: i,
  35. start: i - word.length + 1
  36. })
  37. }
  38. } else {
  39. cur = cur.fail!
  40. word = this.getWord(cur)
  41. cur.fail !== this.root && i--
  42. }
  43. }
  44. return ans
  45. }
  46. }

最终实现

ts
  1. type trieNodeType = {
  2. children: null | Map<string, trieNodeType>
  3. key: string
  4. pre: null | trieNodeType
  5. fail: null | trieNodeType
  6. end: boolean
  7. }
  8. const trieNodeFactory = (key: string, pre: null | trieNodeType = null, end: boolean = false): trieNodeType => {
  9. return {
  10. children: null,
  11. fail: null,
  12. key,
  13. pre,
  14. end
  15. }
  16. }
  17. class ACPatternMatchingMachine {
  18. private root = trieNodeFactory('')
  19. private wordCache = new Map<trieNodeType, string>()
  20. constructor (words: string[]) {
  21. words.forEach(word => this._addWord(word))
  22. this.generateFailRoute()
  23. }
  24. private _addWord (word: string, addNodeCallback?: Function) {
  25. let pre = this.root, len = word.length
  26. for (let i = 0; i < len; i++) {
  27. if (!pre.children) {
  28. pre.children = new Map()
  29. }
  30. let record = pre.children.get(word[i])
  31. if (!record) {
  32. record = trieNodeFactory(word[i], pre, i === len - 1)
  33. pre.children.set(word[i], record)
  34. typeof addNodeCallback === 'function' && addNodeCallback(record)
  35. } else {
  36. record.end = i === len - 1
  37. }
  38. pre = record
  39. }
  40. }
  41. addWord (word: string) {
  42. this._addWord(word, this.generateFailRouteImpl.bind(this))
  43. }
  44. private getWord (record: trieNodeType): string {
  45. const cache = this.wordCache.get(record)
  46. if (cache) {
  47. return cache
  48. }
  49. let ans = '', cur: trieNodeType | null = record
  50. while (cur) {
  51. ans = cur.key + ans
  52. cur = cur.pre
  53. }
  54. this.wordCache.set(record, ans)
  55. return ans
  56. }
  57. private generateFailRoute () {
  58. const queue: trieNodeType[] = [this.root]
  59. while (queue.length) {
  60. const cur = queue.shift()!
  61. this.generateFailRouteImpl(cur)
  62. cur.children && queue.push(...cur.children.values())
  63. }
  64. }
  65. private generateFailRouteImpl (record: trieNodeType) {
  66. if (record === this.root) {
  67. record.fail = record
  68. } else if (record.pre === this.root) {
  69. record.fail = this.root
  70. } else {
  71. const failRoute = record.pre!.fail!
  72. let cur: trieNodeType | null = failRoute
  73. while (cur) {
  74. const tmp = cur.children?.get(record.key)
  75. if (tmp) {
  76. record.fail = tmp
  77. break
  78. }
  79. if (cur === this.root) {
  80. break
  81. }
  82. cur = cur.fail
  83. }
  84. record.fail ??= this.root
  85. }
  86. }
  87. match (text: string) {
  88. const ans: {
  89. word: string,
  90. start: number
  91. end: number
  92. }[] = []
  93. const len = text.length
  94. let word = ''
  95. let cur = this.root
  96. for (let i = 0; i < len; i++) {
  97. const record = cur.children?.get(text[i])
  98. if (record) {
  99. cur = record
  100. word += record.key
  101. if (record.end) {
  102. ans.push({
  103. word,
  104. end: i,
  105. start: i - word.length + 1
  106. })
  107. }
  108. } else {
  109. cur = cur.fail!
  110. word = this.getWord(cur)
  111. cur.fail !== this.root && i--
  112. }
  113. }
  114. return ans
  115. }
  116. }

总结

本文简述了在匹配屏蔽词中多用的 Aho-Corasick 算法的实现,通过失败指针,可以大限度利用已匹配信息,提高匹配效率。

0 条评论未登录用户
Ctrl or + Enter 评论
© 2023-2025 LittleRangiferTarandus, All rights reserved.
🌸 Run