
Python web scraping study notes, day 1 - the requests module and data parsing (regex, bs4, xpath)


I. requests module workflow


1. Specify the URL

url = 'address of the page to scrape'

2. Send the request

    headers = {
        'User-Agent': 'copy this from the browser: open the page, Inspect -> Network -> request headers'
    }

    response = requests.get(url=url, headers=headers)

3. Get the response data

    page_text = response.text

4. Persist the data

with open('output file path', 'file open mode') as fp:
    fp.write(page_text)
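
Putting the four steps together, here is a minimal end-to-end sketch; the URL and output path are placeholders rather than anything from the examples below:

import requests

if __name__ == "__main__":
    url = 'https://example.com/'  # placeholder URL
    headers = {
        # copy a real User-Agent string from your browser's DevTools
        'User-Agent': 'Mozilla/5.0'
    }
    response = requests.get(url=url, headers=headers)  # 2. send the request
    page_text = response.text                          # 3. get the response data
    with open('./page.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)                            # 4. persist it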


II. Data parsing


1. Regular expressions
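
The example below leans on re.findall with a non-greedy capture group (.*?) and the re.S flag, which lets . also match newlines. A minimal sketch on a made-up HTML snippet:

import re

html = '<img src="/a.jpg">\n<img src="/b.jpg">'  # made-up snippet
# (.*?) captures lazily; re.S lets the pattern run across line breaks
print(re.findall('<img src="(.*?)">', html, re.S))  # ['/a.jpg', '/b.jpg']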

Example: complete code for scraping gallery images from tupianzj.com:

import re
import requests
import os

if __name__ == "__main__":
    # Create the output directory if it does not exist yet
    if not os.path.exists('./image'):
        os.mkdir('./image')
    url = 'https://www.tupianzj.com/meinv/mm/jianshennvshen/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.222 Safari/537.36'
    }
    # Fetch the gallery index page
    page_text = requests.get(url=url, headers=headers).text
    # Extract the relative link of every photo set on the index page
    ex = 'ext"><a href="(.*?)" target.*?><img'
    html_list = re.findall(ex, page_text, re.S)
    print('Starting!!!')
    del html_list[1]  # drop one unwanted entry from the list
    for url in html_list:
        aa = url.split('/')  # aa[2] is the category segment of the path
        src = 'https://www.tupianzj.com/' + url
        # Fetch the first page of this photo set
        page_text1 = requests.get(url=src, headers=headers).text
        # Grab the pagination block between the "1" link and the closing </ul>
        ex1 = "<a href='#'>1</a></li>(.*?)</ul>"
        pagination = ''.join(re.findall(ex1, page_text1, re.S))
        # Pull the relative links to the remaining pages of the set
        ex2 = "<a href='(.*?)'>"
        html_list1 = re.findall(ex2, pagination, re.S)
        # Derive page 1's file name from page 2's (xxx_2.html -> xxx.html),
        # prepend it, and drop the trailing "next page" link
        str1 = html_list1[0].replace('_2.', '.')
        html_list1.insert(0, str1)
        html_list1.pop()
        for item in html_list1:
            src1 = 'https://www.tupianzj.com/meinv/' + aa[2] + '/' + item
            page_text2 = requests.get(url=src1, headers=headers).text
            # Extract the full-size image URL from the detail page
            ex3 = 'bigpic.*? src="(.*?)" id.*?alt'
            img_src = re.findall(ex3, page_text2, re.S)
            # Download the binary image data and save it
            img_data = requests.get(url=img_src[0], headers=headers).content
            img_name = img_src[0].split('/')[-1]
            img_path = './image/' + img_name
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
            print(img_name + ' downloaded!!!')
    print('Scraping finished')

2. bs4 (Python only)

(1) Instantiate a BeautifulSoup object and load the page source into it.
(2) Locate tags and extract data by calling the BeautifulSoup object's attributes and methods.
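
A minimal sketch of those two steps on an inline HTML string (the markup is made up for illustration):

from bs4 import BeautifulSoup

html = '<div class="title">Hello</div>'  # made-up page source
soup = BeautifulSoup(html, 'lxml')       # (1) load the source into the object
print(soup.select('.title')[0].text)     # (2) locate the tag and extract "Hello"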

Example: complete code for scraping the novel 雪中悍刀行:

from bs4 import BeautifulSoup
import requests

if __name__ == "__main__":
    # From a local HTML file:
    # fp = open('./test.html', 'r', encoding='utf-8')
    # soup = BeautifulSoup(fp, 'lxml')
    # From the internet:
    # page_text = response.text
    # soup = BeautifulSoup(page_text, 'lxml')
    url = 'https://pluspng.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.222 Safari/537.36'
    }
    page_text = requests.get(url=url, headers=headers).text
    soup = BeautifulSoup(page_text, 'lxml')
    # Common ways to locate tags:
    soup.div
    soup.find('div')  # both return the first <div>
    soup.find_all('div', class_='wrapper')  # every tag whose class is "wrapper"
    soup.select('.sols')  # "." is the CSS class selector
    soup.select('.overlay-content>form>input')[0]
    soup.select('div')[1].text  # .text is all text under the tag; .string is direct text only
    soup.div['id']  # attribute access
    # Walk the novel chapter by chapter until the last chapter's URL is reached
    url1 = 'http://book.zongheng.com/chapter/189169/3431546.html'
    url2 = 'http://book.zongheng.com/chapter/189169/36978052.html'
    while url1 != url2:
        page_text1 = requests.get(url=url1, headers=headers).text
        soup1 = BeautifulSoup(page_text1, 'lxml')
        title = soup1.select('.title_txtbox')[0].text
        paragraphs = soup1.select('.content>p')
        # Append the chapter title and body to the output file
        with open('./雪中悍刀行.txt', 'a', encoding='utf-8') as fp:
            fp.write(title + '\n')
            for item in paragraphs:
                fp.write('   ' + item.text + '\n')
        # Follow the "next chapter" link
        url1 = soup1.select('.nextchapter')[0]['href']
        print(title + ' downloaded')
    print('All chapters found!')
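
A quick illustration of the .text vs .string distinction noted in the selector demos above (inline HTML made up for the demo):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div>outer <span>inner</span></div>', 'lxml')
print(soup.div.text)    # 'outer inner' - all text under the tag
print(soup.div.string)  # None - .string is only set when the tag has a single direct string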

3. xpath

(1) Instantiate an etree object and load the page source into it.
(2) Locate tags and extract data by calling the etree object's xpath method with xpath expressions.
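
A minimal sketch of those two steps on an inline HTML string (the markup is made up for illustration):

from lxml import etree

html = '<div class="pic"><a href="/1.html"><img src="/1.jpg"></a></div>'  # made-up source
tree = etree.HTML(html)                             # (1) load the source into an etree object
print(tree.xpath('//div[@class="pic"]//img/@src'))  # (2) locate and extract: ['/1.jpg']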

Example: complete code for scraping images:

import os
import requests
from lxml import etree

if __name__ == "__main__":
    # Instantiating an etree object from a local HTML file:
    # tree = etree.parse('xxx.html', parser=etree.HTMLParser(encoding='utf-8'))
    # r = tree.xpath('/html/head/meta')
    # a = tree.xpath('/html//meta')  # // matches any number of levels
    # b = tree.xpath('//div[@class="xxx"]/a[3]')  # tag[@attrname=""]; indexing starts at 1
    # text_content = tree.xpath('/div/a[1]/text()')[0]  # //text() returns all text under the tag, including text of nested tags
    # attr = tree.xpath('/div/@src')
    url = 'https://pic.netbian.com/4kmeinv/index.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62'
    }
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gbk'  # the site serves GBK-encoded pages
    page_text = response.text
    tree = etree.HTML(page_text)
    # Target markup looks like:
    # <li><a href="/tupian/28674.html" target="_blank"><img src="/uploads/allimg/220116/214611-164234077159f5.jpg"><b>cosplay原神 珊瑚宫心海</b></a></li>
    src = tree.xpath('//ul[@class="clearfix"]//img/@src')
    name_list = tree.xpath('//ul[@class="clearfix"]//b/text()')
    if not os.path.exists('./image'):
        os.mkdir('./image')
    # enumerate pairs each URL with its position; list.index() would misbehave on duplicates
    for index, i in enumerate(src):
        url1 = 'https://pic.netbian.com/' + i
        name = name_list[index] + '.jpg'
        # Download the binary image data and save it
        img = requests.get(url=url1, headers=headers).content
        img_path = 'image/' + name
        with open(img_path, 'wb') as fp:
            fp.write(img)
        print(name + ' downloaded')
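
One note on the response.encoding line above: pic.netbian.com serves GBK-encoded pages, so the encoding is set by hand before reading .text. When the charset is not known in advance, requests can guess it from the body, as in this sketch:

import requests

response = requests.get('https://pic.netbian.com/4kmeinv/index.html')
# apparent_encoding is requests' guess at the charset based on the response content
response.encoding = response.apparent_encoding
page_text = response.text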