# -*- coding: utf-8 -*-
"""
Created on Mon Oct 22 17:47:24 2018
@author: Python_test
"""
import requests
import pandas as pd
import jieba
#import pkuseg
#seg = pkuseg.pkuseg()
# Load the Excel sheet to be processed; its first column is read as the
# sentence to analyse.
f2 = pd.read_excel('test_20191210.xlsx', sheet_name="test")
#mat = “”"
#我都唔明呢間學校嘅校方高層個腦諗乜,你點樣投誠都唔夠濠江中學咁根正苗紅?啦粵華之秘相信咁多位粵華仔都知道,由今個學期開始,每週一嘅週會都要升「中華人民共和國」嘅國旗。跟住係週會結束之際,係偉大嘅祖国嘅国旗之下,公然宣佈高二級要去天主教嘅教堂念玫瑰經。咁站係偉大祖国嘅立場,係咪犯左「煽動巔覆國家政權罪」呢?一邊係度愛国愛黨,一邊又係度信祖国唔鍾意嘅宗教?,學校會唔會
#"""
# Query document posted with every request (presumably GraphQL, given the
# {"query", "variables"} payload and the "data" envelope of the response).
# NOTE(review): the original query text was destroyed by a rendering error
# ("KaTeX parse error"). This is a reconstruction based on the `sentences`
# variable and the `data.sentiment` path read from the response — confirm the
# variable type and the field name against the API schema before use.
query = """
query(
  $sentences: String!
){
  sentiment(
    sentences:$sentences
  )
}
"""
# The API requires a login, so this URL has to carry the latest access token.
url = "http://python-api.access_token"  # placeholder; set to your own endpoint
# Post every sentence in the first column of f2 to the sentiment API and
# store the returned 'result' and 'words' fields on the same row.
for i, sen in enumerate(f2.iloc[:, 0]):
    variables = {"sentences": sen}
    data = {"query": query, "variables": variables}
    try:
        result = requests.post(url, json=data)
        output = result.json()
        sentiment = output['data']['sentiment']['data']
        # Only 'result' and 'words' are stored; reading 'score' and
        # 'proportion' from the same payload is currently disabled.
        a = sentiment['result']
        d = sentiment['words']
        f2.loc[i, 'result'] = str(a)
        f2.loc[i, 'words'] = str(d)
    except Exception as e:
        # Best effort: report the failure and carry on with the next row.
        print(str(e))
    else:
        # Only echo the payload when this row's request actually succeeded,
        # so a failed first request cannot hit an unbound `output`.
        print(output)
    print(i)
#将结果保存为excel
#f2.to_excel(“ers_时事sentiment(0103~0109)情感素材收集_James.xlsx”,index = False)
#seg結合自定義詞庫切詞
#f4 = pd.read_csv(“ers_sentiment_dict.txt”)[‘ciyu’].tolist()
# Load the sentiment dictionary and keep, for every row, the first
# space-separated token of the first column.
f3 = pd.read_csv("ers_sentiment_dict.txt")
# Select the first column positionally; integer lookup on a row Series with
# string labels relies on a deprecated positional fallback.
l = [entry.split(" ")[0] for entry in f3.iloc[:, 0]]
f4 = l
#f3 = pd.read_csv(“universal_20191021.txt”)
# 獲取詞庫列表
##seg = pkuseg.pkuseg(user_dict=f3)
#f5 =pd.read_csv("") #
def addDict(words: list):
    """Register custom words with jieba so they are kept whole when cutting.

    Uses ``jieba.add_word(word, freq=None, tag=None)``; freq and tag can be
    omitted, freq defaults to a calculated value that ensures the word can
    be cut out.

    Args:
        words: The words to register. A single string is accepted as well and
            is treated as one word (not as a sequence of characters). Empty
            input and empty entries are ignored.

    Returns:
        None.
    """
    if not words:
        return
    # A bare string is one word; iterating it would register each character.
    if isinstance(words, str):
        words = [words]
    # Iterate the argument itself rather than any module-level frame.
    for word in words:
        if word:
            jieba.add_word(word)
# Register the first space-separated token of every line of the user
# dictionary, then load the whole file as a jieba user dictionary.
keys = pd.read_csv("universal_20191204.txt", header=None)
for i, line in enumerate(keys[0]):
    words = line.split(" ")[0]
    addDict([words])  # addDict is annotated to take a list of words
    print("开始添加:", i, words)
# Author's note: a word's frequency has to exceed 10000000 before it takes
# priority when cutting.
jieba.load_userdict("universal_20191204.txt")
# Round-trip the dictionary through Excel, then record for every entry both
# the full-mode and the default-mode jieba segmentation.
keys.to_excel("test.xlsx")
f1 = pd.read_excel("test.xlsx")
f1 = f1.rename(columns={0: "rawdata"})
# The result column is named "Lcutword" to match the column written in the
# loop below; previously a misspelled, always-empty column was declared here.
f2 = pd.DataFrame(f1, columns=["rawdata", "Allcutword", "Lcutword", "compare"])
# The new columns start out as float NaN; make them object-typed so that
# string results can be stored without a dtype conflict.
f2 = f2.astype({"Allcutword": object, "Lcutword": object})
for i, sentence in enumerate(keys[0]):
    # `sentence` is bound before the try so the error report can always use it.
    try:
        Allcutword = jieba.lcut(sentence, cut_all=True)
        Lcutword = jieba.lcut(sentence)
        f2.loc[i, "Allcutword"] = str(Allcutword)
        f2.loc[i, "Lcutword"] = str(Lcutword)
        print("成功切词:{}".format(i), sentence)
    except Exception:
        # Best effort: report the failing row and continue with the next one.
        print("报错:{t}".format(t=i), sentence)
f2.to_excel("切词测试.xlsx")
# --- Disabled experiments, kept for reference; nothing below executes. ---
# Alternative addDict that iterates the list it is given:
# def addDict(dict_list):
#     """
#     jieba.add_word(word, freq=None, tag=None)
#     freq and tag can be omitted, freq defaults to be a calculated value
#     that ensures the word can be cut out.
#     """
#     if dict_list:
#         for i in dict_list:
#             if i:
#                 # jieba.add_word and suggest_freq both force the word
#                 # frequency up
#                 jieba.add_word(i)
# addDict("universal_20191030.txt")
# f5 = pd.read_csv("universal_20191030.txt")
# for i in range(len(f5)):
#     words = f5.iloc[i][0].split(" ")[0]
#     jieba.add_word(words)
# jieba.add_word("universal_20191030.txt")
# s = "衷心感谢"
# test1 = jieba.cut(s)
# for i in test1:
#     print(i)
# test2 = jieba.lcut(s)
# Segment every text in the "contents" column and keep only the tokens that
# are not already listed in the sentiment dictionary (f4).
# NOTE(review): f2 is rebound earlier in this file to the segmentation-test
# frame, whose declared columns do not include "contents" — confirm which
# frame this step is meant to run on.
known_words = set(f4)  # build once: O(1) membership instead of a list scan
f2['cutword'] = f2["contents"].apply(jieba.lcut)  # word segmentation
f2['newword'] = f2['cutword'].apply(
    lambda tokens: [tok for tok in tokens if tok not in known_words]
)
f2.to_excel("testsentiment-result3.xlsx", index=False)