示意图:
import collections# Tire结点
class TireNode(object):def __init__(self):self.children = collections.defaultdict(TireNode)self.is_word = False# Tire Tree
class Tire(object):def __init__(self):self.root = TireNode()def insert(self,word):current = self.rootfor char in word:# defaultdict的索引操作:# 1.key在defaultdict时:return current.children[key]# 2.key不在defaultdict时,分为两步:# (1) current.children[key] = TireNode()# (2) return current.children[key]current = current.children[char]#标记位变为True,表明从根节点到这里是一个词current.is_word = Truedef search(self,word):current = self.rootfor char in word:current = current.children.get(char)# 词典里没有该前缀的词语if not current:return False# 判断是否是一个合法词语return current.is_worddef startsWith(self,word):current = self.rootfor char in word:current = current.children.get(char)# 词典里没有该前缀的词语if not current:return Falsereturn True# 匹配过程:# 南京长江大桥# 南京长江大# ...# 南def enumerateMatch(self,sentence,space = '_'):current = self.rootsentence = list(sentence)matched = []# 依次从后向前匹配while len(sentence) - 1:if self.search(sentence):matched.append('_'.join(sentence))del sentence[-1]return matched t = Tire()
gazetteer = ['南京', '南京市', '市长', '长江', '长江大桥', '大桥']
for gaz in gazetteer:t.insert(gaz)
# 测试insert与search:
matchs = ['南京', '南京市', '市长', '长江', '长江大桥', '大桥','江大桥','小黑']
print('###测试insert与search:\n')
for match in matchs:print(match,':',t.search(match))words = ['南', '京', '市', '长', '江', '大', '桥']
print('###测试enumerateMatch:\n')
print(t.enumerateMatch(words))print('###应用Tire Tree,找到每一个char的对应的word:\n')
words = ['南', '京', '市', '长', '江', '大', '桥']
matched = []
for _ in range(len(words)):matched.append([])for index in range(len(words)):current_word = words[index:]print(index,''.join(current_word),end = ':')result = t.enumerateMatch(current_word)for res in result:matched[index + len(res.split('_')) - 1].append(res)print(' ',result)
print('matched:',matched)
输出:
###测试insert与search:
南京 : True
南京市 : True
市长 : True
长江 : True
长江大桥 : True
大桥 : True
江大桥 : False
小黑 : False
###测试enumerateMatch:
[‘南_京_市’, ‘南_京’]
###应用Tire Tree,找到每一个char的对应的word:
0 南京市长江大桥: [‘南_京_市’, ‘南_京’]
1 京市长江大桥: []
2 市长江大桥: [‘市_长’]
3 长江大桥: [‘长_江_大_桥’, ‘长_江’]
4 江大桥: []
5 大桥: [‘大_桥’]
6 桥: []
matched: [[], [‘南_京’], [‘南_京_市’], [‘市_长’], [‘长_江’], [], [‘长_江_大_桥’, ‘大_桥’]]