当前位置: 代码迷 >> 综合 >> jieba库:Tokenizer()类详解(一)初始化
  详细解决方案

jieba库:Tokenizer()类详解(一)初始化

热度:91   发布时间:2023-11-21 17:31:48.0

2021SC@SDUSC


看到代码:

class Tokenizer(object):
    """Segmenter state: dictionary path, prefix-frequency table and load flags.

    The dictionary is not read in ``__init__``; it is loaded on demand by
    ``initialize()`` (directly or through ``check_initialized()``).
    """

    def __init__(self, dictionary=DEFAULT_DICT):
        """Set up an empty, not-yet-initialized tokenizer.

        Args:
            dictionary: Dictionary to load later. When it equals
                ``DEFAULT_DICT`` it is stored unchanged; any other value is
                converted with ``_get_abs_path`` before being stored.
        """
        # Re-entrant thread lock; initialize() holds it for the whole load so
        # concurrent callers on the same instance do not load in parallel.
        self.lock = threading.RLock()
        if dictionary == DEFAULT_DICT:
            self.dictionary = dictionary
        else:
            self.dictionary = _get_abs_path(dictionary)
        # word / word-prefix -> frequency (prefix-only entries carry 0).
        self.FREQ = {}
        # Sum of the frequencies of all dictionary words.
        self.total = 0
        # Presumably word -> part-of-speech tag for user-added words; it is
        # only created here, not populated in this section -- TODO confirm.
        self.user_word_tag_tab = {}
        # Flipped to True at the end of a successful initialize().
        self.initialized = False
        # Optional overrides for the cache location used by initialize().
        self.tmp_dir = None
        self.cache_file = None

    def __repr__(self):
        """Return a debug representation showing the configured dictionary."""
        return '<Tokenizer dictionary=%r>' % self.dictionary

    @staticmethod
    def gen_pfdict(f):
        """Build the prefix-frequency dictionary from an open dictionary file.

        Each line is stripped, decoded as UTF-8 and split on single spaces;
        the first two fields are the word and its integer frequency.

        Args:
            f: Iterable of byte-string lines with a ``close()`` method. It is
                always closed before this function returns or raises.

        Returns:
            ``(lfreq, ltotal)``: ``lfreq`` maps every word to its frequency
            and every proper prefix not otherwise present to 0; ``ltotal`` is
            the sum of all word frequencies.

        Raises:
            ValueError: If a line cannot be decoded or parsed; the message
                names the file, the line number and the offending line.
        """
        lfreq = {}
        ltotal = 0
        f_name = resolve_filename(f)
        try:
            for lineno, line in enumerate(f, 1):
                try:
                    line = line.strip().decode('utf-8')
                    # Only the first two fields matter here: word, frequency.
                    word, freq = line.split(' ')[:2]
                    freq = int(freq)
                    lfreq[word] = freq
                    ltotal += freq
                    # Register every prefix of the word (frequency 0 if new)
                    # so lookups can tell "prefix of a word" from "unknown".
                    for end in range(1, len(word) + 1):
                        wfrag = word[:end]
                        if wfrag not in lfreq:
                            lfreq[wfrag] = 0
                except ValueError:
                    raise ValueError(
                        'invalid dictionary entry in %s at Line %s: %s' % (f_name, lineno, line))
        finally:
            # Close on the error path too, so a malformed dictionary does not
            # leak the file handle.
            f.close()
        return lfreq, ltotal

    def initialize(self, dictionary=None):
        """Load the dictionary into ``FREQ``/``total``, using a disk cache.

        Args:
            dictionary: Optional dictionary path. When given and different
                from the current one (or when not yet initialized), it
                replaces ``self.dictionary`` and forces a reload. When falsy,
                the currently configured dictionary is used.

        The method returns early if the tokenizer is already initialized for
        the requested dictionary. Otherwise it tries to read a marshal cache
        file; if that is missing, stale or unreadable, it parses the
        dictionary with ``gen_pfdict`` and attempts to rewrite the cache.
        """
        if dictionary:
            abs_path = _get_abs_path(dictionary)
            if self.dictionary == abs_path and self.initialized:
                return
            else:
                self.dictionary = abs_path
                self.initialized = False
        else:
            abs_path = self.dictionary

        with self.lock:
            # If a writer lock is registered for this dictionary, acquire and
            # release it, i.e. wait until that writer has finished.
            try:
                with DICT_WRITING[abs_path]:
                    pass
            except KeyError:
                pass
            # Re-check under the lock: another caller may have finished.
            if self.initialized:
                return

            default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
            t1 = time.time()
            if self.cache_file:
                cache_file = self.cache_file
            # default dictionary
            elif abs_path == DEFAULT_DICT:
                cache_file = "jieba.cache"
            # custom dictionary
            else:
                cache_file = "jieba.u%s.cache" % md5(
                    abs_path.encode('utf-8', 'replace')).hexdigest()
            cache_file = os.path.join(
                self.tmp_dir or tempfile.gettempdir(), cache_file)
            # prevent absolute path in self.cache_file
            tmpdir = os.path.dirname(cache_file)

            load_from_cache_fail = True
            # The cache is usable for the default dictionary, or when it is
            # newer than the custom dictionary file it was built from.
            if os.path.isfile(cache_file) and (
                    abs_path == DEFAULT_DICT or
                    os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
                default_logger.debug(
                    "Loading model from cache %s" % cache_file)
                try:
                    with open(cache_file, 'rb') as cf:
                        self.FREQ, self.total = marshal.load(cf)
                    load_from_cache_fail = False
                except Exception:
                    # Deliberate best-effort: any cache problem falls back to
                    # rebuilding from the dictionary below.
                    load_from_cache_fail = True

            if load_from_cache_fail:
                # Register (or reuse) the writer lock for this dictionary.
                wlock = DICT_WRITING.get(abs_path, threading.RLock())
                DICT_WRITING[abs_path] = wlock
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(self.get_dict_file())
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    try:
                        # prevent moving across different filesystems
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        # A failed cache write is logged, not fatal: the
                        # in-memory tables are already populated.
                        default_logger.exception("Dump cache file failed.")

                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

            self.initialized = True
            default_logger.debug(
                "Loading model cost %.3f seconds." % (time.time() - t1))
            default_logger.debug("Prefix dict has been built successfully.")

    def check_initialized(self):
        """Run ``initialize()`` if the tokenizer has not been initialized yet."""
        if not self.initialized:
            self.initialize()

  相关解决方案