# Python3 urllib (网络数据获取模块)
# 本文由 Luzhuo 编写, 转发请保留该信息.
# 原文: http://blog.csdn.net/Rozol/article/details/69941511
# 以下代码以 Python 3.6.1 为例
# Less is more!
#coding=utf-8
# urllibdemo.py urllib演示
# urllib 用于处理Url相关的工具,用于从网络获取数据(网页源码/下载资源)
from urllib import request # 请求url, 支持 HTTP(0.9/1.0) / FTP / 本地文件 / URL
from urllib import parse # 解析url, 支持 file / ftp / gopher / hdl / http / https / imap / mailto / mms / news / nntp / prospero / rsync / rtsp / rtspu / sftp / shttp / sip / sips / snews / svn / svn+ssh / telnet / wais
from urllib import robotparser # 分析 robots.txt 文件
from urllib import error # 异常
import re # 正则模块
from bs4 import BeautifulSoup
import os# 演示(下载斗鱼首页的图片)
def demo():
    """Download the images referenced on the Douyu home page into ./images.

    The page source is fetched once and then mined twice to demonstrate two
    extraction techniques: a regular expression, and Beautiful Soup
    (install with ``pip install beautifulsoup4``).  Both passes number their
    files from 0, so files from the second pass overwrite same-named files
    from the first.

    Performs real network requests and writes files to disk.
    """
    page_url = "https://www.douyu.com"

    # exist_ok keeps a second run from failing with FileExistsError.
    os.makedirs("images", exist_ok=True)

    # -- Fetch the page source --
    # The context manager closes the response even if decoding fails.
    with request.urlopen(page_url) as f:
        data = f.read().decode("utf-8")

    # -- Extract the image addresses from the page source --
    # Approach 1: regular expression. Each match is (address, extension).
    images = re.findall(r'src="(.*?\.(jpg|png))"', data)
    for tnum, (src, ext) in enumerate(images):
        # urljoin leaves absolute URLs alone and resolves relative or
        # protocol-relative ("//host/path") ones against the page URL.
        request.urlretrieve(parse.urljoin(page_url, src),
                            "./images/%d.%s" % (tnum, ext))

    # Approach 2: Beautiful Soup, which extracts content from html/xml tags.
    soup = BeautifulSoup(data, "html.parser")
    tnum = 0
    for tag in soup.find_all("img"):
        imgurl = tag.get("src")
        # <img> tags without a src attribute yield None; skip them together
        # with addresses too short to be real URLs.
        if imgurl and len(imgurl) > 3:
            request.urlretrieve(parse.urljoin(page_url, imgurl),
                                "./images/%d.jpg" % tnum)
            tnum += 1


# Detailed walkthrough of the urllib APIs and their parameters
def fun():
    """Walk through the main urllib.parse / urllib.request / urllib.error APIs.

    This is a tutorial function: most assignments exist only to show what a
    call returns, so many locals are overwritten or never read.  It performs
    real network requests and downloads a file to ./xxx.jpg; any urllib error
    is printed rather than propagated.
    """
    neturl = "http://luzhuo.me/blog/Base1.html"
    imgurl = "http://luzhuo.me/image/performers/%E5%85%B3%E6%99%93%E5%BD%A4.jpg"

    # --- urllib.parse --- URL parsing
    # - Encoding -
    neturl = "%s?%s" % (neturl, parse.urlencode({"name": "luzhuo", "age": 21}))  # build the URL for GET parameters
    data = parse.urlencode({"name": "luzhuo", "啊age": 21}).encode('ascii')  # build the body for POST parameters

    # - Decoding -
    urls = parse.urlparse(imgurl)  # => ParseResult(scheme='http', netloc='luzhuo.me', path='/image/performers/%E5%85%B3%E6%99%93%E5%BD%A4.jpg', params='', query='', fragment='')
    urls = parse.urlparse("//luzhuo.me/image/performers/%E5%85%B3%E6%99%93%E5%BD%A4.jpg?a=1")
    scheme = urls.scheme  # read an individual component

    # - Replacing -
    url = parse.urljoin('http://luzhuo.me/blog/Base1.html', 'Fame.html')  # replace the last part => http://luzhuo.me/blog/Fame.html
    url = parse.urljoin('http://luzhuo.me/blog/Base1.htm', '//xxx/blog')  # => http://xxx/blog

    # --- urllib.request --- requesting data
    try:
        # - Request - construction
        req = request.Request(neturl)  # GET
        req = request.Request(neturl, headers={"1": "2"})  # with request headers
        req = request.Request(neturl, data=b'This is some datas.')  # POST: attach a request body
        req = request.Request(neturl, data)  # POST: attach a request body
        req = request.Request(neturl, data=b"This is some datas.", method="PUT")  # PUT / other request methods

        # Reading
        url = req.full_url  # the URL
        reqtype = req.type  # scheme (e.g. http)
        host = req.host  # host name (e.g. luzhuo.me / luzhuo.me:8080)
        host = req.origin_req_host  # host name (e.g. luzhuo.me)
        url = req.selector  # URL path (e.g. /blog/Base1.html)
        data = req.data  # request body, None when absent
        boolean = req.unverifiable  # whether the request is unverifiable as defined by RFC 2965
        method = req.get_method()  # request method (e.g. GET / POST)

        # Modifying
        req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")  # add a header; an existing key is overwritten
        req.add_unredirected_header("Key", "value")  # add a header that is not sent on redirects
        req.remove_header("Key")  # delete a header
        req.get_header("Key")  # read a header, None when missing
        req.get_header("Key", "None.")  # read a header with a fallback value
        boolean = req.has_header("Key")  # whether the header is present
        headers = req.header_items()  # list of (all) request headers
        req.set_proxy("220.194.55.160:3128", "http")  # set a proxy (host, type)

        # Downloading
        filename, headers = request.urlretrieve(imgurl)  # download to a temporary file; returns a (filename, headers) tuple
        filename, headers = request.urlretrieve(imgurl, filename="./xxx.jpg", reporthook=callback, data=None)  # reporthook: progress callback
        request.urlcleanup()  # remove temporary files left by urlretrieve

        # - Response - request results
        # Each response owns a connection, so every one is closed explicitly.
        res = request.urlopen(neturl)  # GET: open the URL, returns a response
        res.close()
        res = request.urlopen(neturl, data=b'This is some datas.')  # POST: attach a request body
        res.close()
        with request.urlopen(req) as res:  # a Request object is accepted too
            # Reading information
            data = res.read().decode("utf-8")  # read all data
            data = res.readline().decode("utf-8")  # read one line
            url = res.geturl()  # the URL
            info = res.info()  # meta information such as headers
            code = res.getcode()  # status code

    # --- urllib.error ---
    # HTTPError and ContentTooShortError are subclasses of URLError, so they
    # must be listed first or their handlers can never run.
    except error.HTTPError as e:
        # carries code / reason / headers
        print(e)
    except error.ContentTooShortError as e:
        # the download was truncated
        print(e)
    except error.URLError as e:
        print(e)


def robot():
    """Demonstrate urllib.robotparser by loading and querying a robots.txt.

    Performs a real network request to fetch the file.
    """
    # --- urllib.robotparser --- robots.txt
    rp = robotparser.RobotFileParser()
    rp.set_url("https://www.zhihu.com/robots.txt")  # point the parser at a robots.txt URL
    rp.read()  # fetch the file and feed it to the parser
    allowed = rp.can_fetch("*", "http://www.musi-cal.com/")  # whether fetching this URL is allowed
    fetched_at = rp.mtime()  # time the robots.txt was last fetched
    rp.modified()  # set the robots.txt fetch time to now


# Download progress callback
def callback(datanum, datasize, filesize):
    """Print download progress; suitable as ``reporthook`` for ``urlretrieve``.

    Args:
        datanum: Number of blocks transferred so far.
        datasize: Size of one block in bytes.
        filesize: Total size of the file in bytes.  ``urlretrieve`` reports a
            non-positive value (typically -1) when the server does not
            announce a size.

    Prints the completed percentage capped at 100, or the number of bytes
    transferred when the total size is unknown.
    """
    if filesize <= 0:
        # No meaningful total: a percentage would divide by zero or come out
        # negative, so report the raw byte count instead.
        print("%d bytes" % (datanum * datasize))
        return
    # The last block is usually only partly filled, so the raw figure can
    # exceed 100; clamp it.
    down = min(100 * datanum * datasize / filesize, 100)
    print("%.2f%%" % down)


import ssl

# Accessing an https page whose SSL certificate cannot be verified fails with:
# [SSL: CERTIFICATE_VERIFY_FAILED]
def https():
    """Fetch an HTTPS page while skipping SSL certificate verification.

    Skipping verification removes protection against man-in-the-middle
    attacks; it is shown here only to get past CERTIFICATE_VERIFY_FAILED on
    sites with unverifiable certificates.

    Performs a real network request and prints the decoded page.
    """
    url = 'https://www.12306.cn'

    # Ignore certificate verification, using the public SSLContext API
    # rather than the private ssl._create_unverified_context().
    context = ssl.create_default_context()
    context.check_hostname = False  # must be off before verify_mode may be CERT_NONE
    context.verify_mode = ssl.CERT_NONE

    req = request.Request(url)
    # The context manager closes the connection once the body has been read.
    with request.urlopen(req, context=context) as res:
        data = res.read().decode("utf-8")
    print(data)


# Using Handler objects
def handler():
    """Show the commonly used urllib.request handlers and build an opener.

    Each handler is assigned to the same local in turn for demonstration, so
    only the last one created (the ProxyBasicAuthHandler) is passed to
    ``build_opener``.

    Performs a real network request and prints the decoded page.
    """
    url = 'http://www.baidu.com'
    # url = 'https://www.12306.cn'

    # --- Some commonly used handlers ---
    # Handler for HTTP URLs
    demo_handler = request.HTTPHandler()  # debuglevel=1 turns on automatic debug output

    # Handler for HTTPS URLs
    # Ignore certificate verification via the public SSLContext API.
    context = ssl.create_default_context()
    context.check_hostname = False  # must be off before verify_mode may be CERT_NONE
    context.verify_mode = ssl.CERT_NONE
    demo_handler = request.HTTPSHandler(context=context)

    # Handler that routes requests through a proxy
    proxy = {
        "http": "113.221.47.13:39249",  # free proxy
        # "http": "账号:密码@183.56.177.130:808",  # authenticated proxy (user:password@host:port)
    }
    demo_handler = request.ProxyHandler(proxy)

    # HTTPCookieProcessor object that stores cookies
    demo_handler = request.HTTPCookieProcessor()

    # Handler for HTTP authentication
    passwordMgr = request.HTTPPasswordMgrWithDefaultRealm()  # manager holding the account/password mapping
    passwordMgr.add_password(None, '192.168.0.1', 'user', 'password')  # add an account and password
    demo_handler = request.HTTPBasicAuthHandler(passwordMgr)

    # Handler for proxy authentication
    demo_handler = request.ProxyBasicAuthHandler(passwordMgr)

    # Build the custom opener
    custom_opener = request.build_opener(demo_handler)  # several handlers may be passed
    req = request.Request(url)
    # The context manager closes the connection once the body has been read.
    with custom_opener.open(req) as res:
        print(res.read().decode("utf-8"))


# Three different ways to send a request through an opener (each has its own use case)
def opener():
    """Demonstrate three ways of sending a request with urllib.request.

    Note that the third way calls ``request.install_opener``, which changes
    the opener used by every later ``request.urlopen`` call in the process.

    Performs real network requests and prints the last response body.
    """
    url = 'https://www.douyu.com'

    # Way 1: the usual approach
    res = request.urlopen(url)
    res.close()  # release the connection; the body is not needed here

    # Way 2: through a custom opener
    http_handler = request.HTTPHandler()
    custom_opener = request.build_opener(http_handler)
    req = request.Request(url)
    res = custom_opener.open(req)
    res.close()  # release the connection; the body is not needed here

    # Way 3: install the opener globally; afterwards every request can be
    # sent directly with urlopen()
    request.install_opener(custom_opener)
    with request.urlopen(url) as res:
        print(res.read().decode("utf-8"))


if __name__ == "__main__":
    demo()
    fun()
    robot()
    https()
    handler()
    opener()