
Python: Scraping Youdao Translate && Anti-Scraping Basics && {"errorCode":50}


Anti-scraping

  1. Python requests are rejected outright; the server only wants to serve browsers.
    Fix: modify the request headers (add a User-Agent; ideally rotate among several at random, see the sketch after the code block below).
import urllib.request
import urllib.parse

url = '....'   # target URL (placeholder)
data = {...}   # POST form data, copied from the bottom of the request details panel in DevTools
head = {}
data = urllib.parse.urlencode(data).encode('utf-8')
# Copy the User-Agent shown under Network -> (the POST request) -> Headers in the
# browser's developer tools into the Python request.
# Method 1: pass the headers dict when building the Request
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58'
res = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(res)
# Method 2: add the header after building the Request
req = urllib.request.Request(url, data)
req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58')
response = urllib.request.urlopen(req)
# Sometimes you also need:
# head['Cookie']   (e.g. 'OUTFOX_SEARCH_USER_ID=-1764369496@10.108.160.18;')
# head['Referer']  (e.g. 'http://fanyi.youdao.com/')
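A minimal sketch of the random User-Agent rotation mentioned above. The second UA string is an illustrative example, and the snippet reuses the url and data placeholders from the block above.

import random
import urllib.request

# Hypothetical pool of User-Agent strings; any set of real browser UAs will do.
ua_list = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0',
]

req = urllib.request.Request(url, data)
req.add_header('User-Agent', random.choice(ua_list))  # pick a different UA for each request
response = urllib.request.urlopen(req)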
  2. Requests from one IP arrive too frequently (the server starts demanding a CAPTCHA or simply refuses).
    Fixes:
    Method 1: use the time module and call time.sleep(n) inside the loop to pause like a human would (not recommended, far too slow); a sketch follows the proxy example below.
    Method 2: go through a proxy, so the server only sees the proxy's IP (free proxy IPs can be found online).
# 1.  proxy_support = urllib.request.ProxyHandler({'scheme': 'ip:port'})
# 2.  opener = urllib.request.build_opener(proxy_support)
# 3a. urllib.request.install_opener(opener)  installs the opener globally, once and for all
# 3b. opener.open(url)  or open pages with this particular opener only
import urllib.request
import random  # collect several proxies and pick one at random per request
url = 'https://www.csdn.net/'  # test URL; ideally use a page that echoes your IP so you can verify the proxy
iplist = ['163.125.221.229:8118', '183.166.20.6:9999', '125.108.80.240:9000', '125.108.123.95:9000']
proxy_support = urllib.request.ProxyHandler({'http': random.choice(iplist)})
opener = urllib.request.build_opener(proxy_support)
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58')]
urllib.request.install_opener(opener)
response = urllib.request.urlopen(url)
# If utf-8 decoding fails, pass errors='ignore' to decode() (the offending characters come out garbled)
html = response.read().decode('utf-8')  # check the page's charset in the browser's developer tools
print(html)
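Method 1 from the list above as a minimal sketch: the URL list is a placeholder, the only point is the randomized pause between requests.

import time
import random
import urllib.request

urls = ['https://www.csdn.net/']  # placeholder list of pages to fetch
for u in urls:
    response = urllib.request.urlopen(u)
    print(len(response.read()))
    time.sleep(random.uniform(2, 5))  # pause a few seconds between requests to look human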

Scraping Youdao Translate

import urllib.request
import urllib.parse  # for urlencode
import json
# The real endpoint is translate_o; the _o is dropped here because the newer endpoint
# has extra anti-scraping checks. For the full fix see: https://tendcode.com/article/youdao-spider/
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
content = input('Enter text to translate: ')
# Anti-scraping: set the headers, method 1
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58'
data = {
    'i': content,
    'from': 'AUTO',
    'to': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15972989870051',
    'sign': '26393ce3bf4fc21109f3d258493f0ac1',
    'lts': '1597298987005',
    'bv': 'a612219d8ae465584a02998c7f4cede1',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_REALTlME'
}
# URL-encode the form data, then encode Python's Unicode string into utf-8 bytes
data = urllib.parse.urlencode(data).encode('utf-8')
res = urllib.request.Request(url, data, head)
response = urllib.request.urlopen(res)
# Anti-scraping: set the headers, method 2 (after building the Request)
# req = urllib.request.Request(url, data)
# req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58')
# response = urllib.request.urlopen(req)
html = response.read().decode('utf-8')  # decode the response bytes
# print(html): '{"type":"EN2ZH_CN","errorCode":0,"elapsedTime":1,
# "translateResult":[[{"src":"i love you","tgt":"我爱你"}]]}'
target = json.loads(html)  # parse the JSON string into a Python dict
print('Translation: %s' % target['translateResult'][0][0]['tgt'])
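The script above indexes translateResult blindly; when the request is rejected (for example the {"errorCode":50} from the title, which reportedly shows up when the salt/sign pair is stale or mismatched) that key is missing and the last line raises a KeyError. A minimal defensive check, reusing the target dict from the script above:

# Guard against {"errorCode":50} (and other non-zero error codes) before indexing the result.
if target.get('errorCode', 0) != 0 or 'translateResult' not in target:
    print('Request rejected, errorCode = %s' % target.get('errorCode'))
else:
    print('Translation: %s' % target['translateResult'][0][0]['tgt'])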

After following the reference (https://tendcode.com/article/youdao-spider/), the translate_o endpoint can be used directly:

import urllib.request
import urllib.parse
import hashlib
import time
import json
import random


class Youdao(object):
    def __init__(self, msg):
        self.msg = msg
        self.url = 'http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule'
        self.D = "]BjuETDhU)zqSxf-=B#7m"
        self.salt = self.get_salt()
        self.sign = self.get_sign()

    def get_md(self, value):
        '''MD5 hash'''
        m = hashlib.md5()
        # m.update(value)
        m.update(value.encode('utf-8'))
        return m.hexdigest()

    def get_salt(self):
        '''Build the salt parameter from the current timestamp'''
        s = int(time.time() * 1000) + random.randint(0, 10)
        return str(s)

    def get_sign(self):
        '''Build the sign parameter by MD5-hashing the other parameters'''
        s = "fanyideskweb" + self.msg + self.salt + self.D
        return self.get_md(s)

    def get_result(self):
        headers = {}
        headers['Cookie'] = 'OUTFOX_SEARCH_USER_ID=-1764369496@10.108.160.18;'
        headers['Referer'] = 'http://fanyi.youdao.com/'
        headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.58'
        data = {
            'i': self.msg,
            'from': 'AUTO',
            'to': 'AUTO',
            'smartresult': 'dict',
            'client': 'fanyideskweb',
            'salt': self.salt,
            'sign': self.sign,
            'doctype': 'json',
            'version': '2.1',
            'keyfrom': 'fanyi.web',
            'action': 'FY_BY_CL1CKBUTTON',
            'typoResult': 'true'
        }
        # URL-encode the form data, then encode Python's Unicode string into utf-8 bytes
        data = urllib.parse.urlencode(data).encode('utf-8')
        res = urllib.request.Request(self.url, data, headers)
        response = urllib.request.urlopen(res)
        html = response.read().decode('utf-8')  # decode the response bytes
        # print(html): '{"type":"EN2ZH_CN","errorCode":0,"elapsedTime":1,
        # "translateResult":[[{"src":"i love you","tgt":"我爱你"}]]}'
        target = json.loads(html)  # parse the JSON string into a Python dict
        print('Translation: %s' % target['translateResult'][0][0]['tgt'])


if __name__ == '__main__':
    content = input('Enter text to translate: ')
    y = Youdao(content)
    y.get_result()
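For reference, the salt/sign derivation the class performs can be checked in isolation. A minimal sketch, assuming the same client string "fanyideskweb" and key "]BjuETDhU)zqSxf-=B#7m" as above:

import hashlib
import time
import random

msg = 'i love you'
D = ']BjuETDhU)zqSxf-=B#7m'
salt = str(int(time.time() * 1000) + random.randint(0, 10))
sign = hashlib.md5(('fanyideskweb' + msg + salt + D).encode('utf-8')).hexdigest()
print(salt, sign)
# salt and sign must be sent as a matching pair; a sign computed with a different salt
# (or a stale hard-coded pair) is a common way to end up with {"errorCode":50}.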