当前位置: 代码迷 >> 综合 >> bert2vec+kmeans
  详细解决方案

bert2vec+kmeans

热度:20   发布时间:2023-09-22 17:31:47.0

from bert_serving.client import BertClient
from sklearn.cluster import KMeans

#ivy_nie
# Module-level BERT client, created once when the module is loaded;
# wordsCluster calls bc.encode(...) on it to turn text into vectors.
# NOTE(review): presumably requires a reachable bert-serving server — confirm.
bc = BertClient()
def wordsCluster(text, vectorSize, classCount):
    """Cluster the lines of a text file using BERT vectors and K-Means.

    Every distinct, non-blank line of ``text`` is encoded with the
    module-level ``bc`` client, the resulting vectors are split into
    ``classCount`` clusters, and each cluster is printed and written to
    ``result.txt`` as ``label_<i>:<list of lines>``.

    Args:
        text: Local path of the UTF-8 input file, one item per line.
        vectorSize: Word-vector size. Not used by this function; the
            parameter is kept so existing callers keep working.
        classCount: Number of clusters (the k of K-Means).

    Returns:
        None. The clusters are reported on stdout and in ``result.txt``.
    """
    # De-duplicate while keeping first-seen order. dict keys give O(1)
    # membership tests instead of scanning a list for every line.
    # Blank lines carry no text to embed, so they are skipped.
    with open(text, 'r', encoding='utf-8') as source:
        stripped = (line.replace('\n', '') for line in source)
        names = list(dict.fromkeys(line for line in stripped if line.strip()))

    # Encode exactly the lines kept in `names`, so that vector j and
    # names[j] always describe the same line.
    wordvector = [bc.encode([line])[0] for line in names]

    # Clustering
    clf = KMeans(n_clusters=classCount)
    assignments = clf.fit_predict(wordvector)

    # One bucket per cluster id, filled in a single pass over the labels.
    clusters = [[] for _ in range(classCount)]
    for cluster_id, line in zip(assignments, names):
        clusters[cluster_id].append(line)

    # The context manager guarantees the report is flushed and closed.
    with open('result.txt', 'w', encoding='utf-8') as report:
        for i, members in enumerate(clusters):
            row = 'label_' + str(i) + ':' + str(members)
            print(row)
            report.write(row + '\n')

# Script entry: cluster the lines of ./wan.txt into 500 groups
# (300 is passed as vectorSize).
wordsCluster('./wan.txt', 300, 500)