使用 Python 的 urllib.request 和 lxml.etree 爬取百度贴吧的图片并保存到本地,源代码如下:
import re
import time
import urllib.parse
import urllib.request

from lxml import etree

# ------ Fetch the source of a web page ------
def getHtml(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    # Send a browser-like User-Agent instead of urllib's default one. The
    # value must not repeat the header name ("User-Agent:...").
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(request) as response:
        return response.read()


# ------ Pass the URL of any post to getHtml() ------
html = getHtml(
    url='https://tieba.baidu.com/index.html',
)
# ------ Optionally re-encode the html object as UTF-8 text ------
# The page is kept as the value returned by getHtml(); call
# html.decode('UTF-8') here if decoded text is needed.

# ------ Collect the image elements found in the post ------
def getImg(html):
    """Return every ``<img>`` element found in *html*.

    Args:
        html: Page source to parse; passed unchanged to ``etree.HTML``.

    Returns:
        The result of the XPath query ``//img`` on the parsed document.
    """
    # ------ Parse the page and select the image elements ------
    # An earlier regex-based variant (matching src="...jpg" and alt) was
    # replaced by this XPath query.
    return etree.HTML(html).xpath('//img')


imgList = getImg(html)
# Download every image in imgList into D:\Temp\, one file per <img> element.
# NOTE(review): the original formatting was lost; imgNamenum is assumed to
# advance once per processed element — confirm.
imgNamenum = 0
for one in imgList:
    # ------ Ideally this loop would also use multithreading ------
    imgPath = one.get('src')
    if imgPath:
        # urljoin resolves relative, root-relative and protocol-relative
        # paths against the site and leaves absolute URLs untouched.
        imgPath = urllib.parse.urljoin('https://tieba.baidu.com/', imgPath)
        # Name the file after the alt text, replacing characters that are
        # not allowed in Windows file names; fall back to a timestamp when
        # the alt text is missing or empty.
        imgName = (one.get('alt') or '').strip()
        imgName = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', imgName)
        if not imgName:
            imgName = str(time.time())
        try:
            # Download first so that a failed request leaves no empty file
            # behind; both context managers release their resources even
            # when an error occurs.
            with urllib.request.urlopen(imgPath) as response:
                imgData = response.read()
            with open('D:\\Temp\\' + imgName + ".jpg", 'wb') as f:
                f.write(imgData)
        except Exception as e:
            # Best effort: report why this image failed and carry on with
            # the next one instead of aborting the whole run.
            print(imgPath + " error: " + str(e))
        else:
            print(imgPath)
            # Short pause between successful downloads.
            time.sleep(0.1)
    else:
        # An <img> without a src has nothing to download. Previously this
        # raised inside the try block and then crashed the error handler
        # itself (None + " error").
        print("<img> without src skipped")
    imgNamenum += 1
print("All Done!")
结果如下:
注:本文仅用于技术交流,不得用于商业用途。如有违反,后果自负,与本文作者无关。