I. The requests module workflow
1. Specify the URL
url = 'address of the page to crawl'
2. Send the request
headers = {
    'User-Agent': 'copy this from the browser: Inspect -> Network -> request headers'
}
response = requests.get(url=url, headers=headers)
3. Get the response data
page_text = response.text
4. Persist the data
with open('path to the output file', 'file mode') as fp:
    fp.write(page_text)
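Putting the four steps together, here is a minimal end-to-end sketch; https://example.com is just a placeholder target, and the UA string is one copied from a browser:
import requests

if __name__ == '__main__':
    url = 'https://example.com'  # placeholder target page
    headers = {
        # a desktop UA string taken from the browser's Network panel
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)  # send the request
    page_text = response.text                          # read the response body
    with open('./page.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)                            # persist to disk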
II. Data parsing
1. Regular expressions
Example: the complete code for crawling gallery images from tupianzj.com follows:
import re
import requests
import os

if __name__ == "__main__":
    if not os.path.exists('./image'):
        os.mkdir('./image')
    url = 'https://www.tupianzj.com/meinv/mm/jianshennvshen/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.222 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    # extract the relative links of the gallery detail pages from the list page
    ex = 'ext"><a href="(.*?)" target.*?><img'
    html_list = re.findall(ex, page_text, re.S)
    del html_list[1]  # the second match is not a gallery link; drop it
    print('Starting the crawl!')
    for link in html_list:
        aa = link.split('/')  # aa[2] is the category segment of the path
        src = 'https://www.tupianzj.com/' + link
        page_text1 = requests.get(url=src, headers=headers).text
        # grab the pagination block that follows the "page 1" anchor
        ex1 = "<a href='#'>1</a></li>(.*?)</ul>"
        page_block = ''.join(re.findall(ex1, page_text1, re.S))
        ex2 = "<a href='(.*?)'>"
        html_list1 = re.findall(ex2, page_block, re.S)
        # rebuild the page-1 filename from the page-2 link ('xxx_2.html' -> 'xxx.html'),
        # prepend it, and drop the trailing "next page" link
        str1 = html_list1[0].replace('_2.', '.')
        html_list1.insert(0, str1)
        html_list1.pop()
        for item in html_list1:
            src1 = 'https://www.tupianzj.com/meinv/' + aa[2] + '/' + item
            page_text2 = requests.get(url=src1, headers=headers).text
            # the full-size image is the src of the element tagged "bigpic"
            ex3 = 'bigpic.*? src="(.*?)" id.*?alt'
            img_src = re.findall(ex3, page_text2, re.S)
            img_data = requests.get(url=img_src[0], headers=headers).content
            img_name = img_src[0].split('/')[-1]  # last URL segment as the filename
            img_path = './image/' + img_name
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
            print(img_name + ' downloaded')
    print('Crawl finished')
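All the patterns above pass re.S (DOTALL), which lets '.' match newline characters; without it the lazy groups cannot span multiple lines of HTML source. A minimal illustration on a made-up string:
import re

text = 'start\nmiddle\nend'
print(re.findall('start(.*?)end', text))        # [] because '.' stops at '\n'
print(re.findall('start(.*?)end', text, re.S))  # ['\nmiddle\n']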
2. bs4 (Python-only)
(1) Instantiate a BeautifulSoup object and load the page source into it.
(2) Locate tags and extract data by calling the object's properties and methods.
Example: the complete code for crawling the novel 雪中悍刀行 follows:
from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    # Loading a local html file:
    # fp = open('./test.html', 'r', encoding='utf-8')
    # soup = BeautifulSoup(fp, 'lxml')
    # Loading a page fetched from the internet:
    # page_text = response.text
    # soup = BeautifulSoup(page_text, 'lxml')
    url = 'https://pluspng.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.222 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(page_text, 'lxml')
    # quick reference for the locating syntax (results unused here):
    soup.div
    soup.find('div')                        # both return the first div
    soup.find_all('div', class_='wrapper')  # every tag whose class is "wrapper"
    soup.select('.sols')                    # '.' is the CSS class selector
    soup.select('.overlay-content>form>input')[0]
    soup.select('div')[1].text              # .text returns all nested text; .string only the direct text
    soup.div['id']                          # attribute access

    # crawl chapter by chapter; url2 is the final chapter and serves as the stop condition
    url1 = 'http://book.zongheng.com/chapter/189169/3431546.html'
    url2 = 'http://book.zongheng.com/chapter/189169/36978052.html'
    while url1 != url2:
        page_text1 = requests.get(url=url1, headers=headers).text
        soup1 = BeautifulSoup(page_text1, 'lxml')
        title = soup1.select('.title_txtbox')[0].text
        paragraphs = soup1.select('.content>p')
        with open('./雪中悍刀行.txt', 'a', encoding='utf-8') as fp:
            fp.write(title + '\n')
            for item in paragraphs:
                fp.write('  ' + item.text + '\n')
        url1 = soup1.select('.nextchapter')[0]['href']  # follow the "next chapter" link
        print(title + ' downloaded')
    print('All chapters done!')
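For a side-by-side feel of the two locating styles used above (find/find_all vs. CSS select), here is a small sketch on a made-up snippet:
from bs4 import BeautifulSoup

html = '<div class="wrapper"><a href="/a.html">first</a><a href="/b.html">second</a></div>'
soup = BeautifulSoup(html, 'lxml')
print(soup.find('a')['href'])                     # /a.html - first match only
print([a.text for a in soup.find_all('a')])       # ['first', 'second']
print(soup.select('div.wrapper > a')[1]['href'])  # /b.html - CSS selector syntax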
3. xpath
(1) Instantiate an etree object and load the page source into it.
(2) Locate tags and extract data by calling the object's xpath method with an xpath expression.
Example: the complete code for crawling images follows:
import os
import requests
from lxml import etree

if __name__ == "__main__":
    # Instantiating an etree object from a local file:
    # tree = etree.parse('xxx.html', parser=etree.HTMLParser(encoding='utf-8'))
    # r = tree.xpath('/html/head/meta')
    # a = tree.xpath('/html//meta')               # // matches any number of levels
    # b = tree.xpath('//div[@class="xxx"]/a[3]')  # tag[@attr="value"]; indexing starts at 1
    # text = tree.xpath('/div/a[1]/text()')[0]    # //text() returns all nested text, including child tags
    # attr = tree.xpath('/div/@src')
    url = 'https://pic.netbian.com/4kmeinv/index.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'  # the site is GBK-encoded; fixes mojibake in the titles
    page_text = response.text
    tree = etree.HTML(page_text)
    # target markup looks like:
    # <li><a href="/tupian/28674.html" target="_blank"><img src="/uploads/allimg/220116/214611-164234077159f5.jpg"><b>cosplay原神 珊瑚宫心海</b></a></li>
    src = tree.xpath('//ul[@class="clearfix"]//img/@src')
    name_list = tree.xpath('//ul[@class="clearfix"]//b/text()')
    if not os.path.exists('./image'):
        os.mkdir('./image')
    for index, i in enumerate(src):
        url1 = 'https://pic.netbian.com/' + i
        name = name_list[index] + '.jpg'
        img = requests.get(url=url1, headers=headers).content
        img_path = 'image/' + name
        with open(img_path, 'wb') as fp:
            fp.write(img)
        print(name + ' downloaded')
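Hard-coding response.encoding = 'gbk' works because this site's charset is known in advance; when it is not, requests can infer one from the response body. A small sketch (the printed value is what this site typically reports, not guaranteed):
import requests

response = requests.get('https://pic.netbian.com/4kmeinv/index.html',
                        headers={'User-Agent': 'Mozilla/5.0'})
# apparent_encoding is detected from the body rather than the HTTP headers
response.encoding = response.apparent_encoding
print(response.encoding)  # typically GBK/GB2312 for this site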