问题描述
我想列出一个韩国音乐节的清单,所以我试着爬了一个卖音乐节门票的网站:
import requests
from bs4 import BeautifulSoup

INTERPARK_BASE_URL = 'http://ticket.interpark.com'

# Festival List Page.
# Reuse ONE Session for every request: all pages live on the same host, so
# keep-alive lets us pay the TCP/HTTP handshake once instead of once per
# detail page -- this is the main (free) speedup for a sequential crawl.
session = requests.Session()

req = session.get('http://ticket.interpark.com/TPGoodsList.asp?Ca=Liv&SubCa=Fes')
soup = BeautifulSoup(req.text, 'lxml')

for title_raw in soup.find_all('span', class_='fw_bold'):
    title = str(title_raw.find('a').text)
    url = INTERPARK_BASE_URL + str(title_raw.find('a').get('href'))

    # Detail Page
    req_detail = session.get(url)
    soup_detail = BeautifulSoup(req_detail.text, 'lxml')
    details_1 = soup_detail.find('table', class_='table_goods_info')
    details_2 = soup_detail.find('ul', class_='info_Lst')
    image = soup_detail.find('div', class_='poster')

    # Parse the <td> list once instead of re-running find_all per field.
    tds = details_1.find_all('td')
    singers = str(tds[4].text)
    place = str(tds[5].text)
    date_text = str(details_2.find('span').text)
    image_url = str(image.find('img').get('src'))

    print(title)
    print(url)
    print(singers)
    print(place)
    print(date_text)
    print(image_url)
我使用for循环浏览列表页面中的所有详细页面,但是加载每个详细页面太慢了。
如何加速我的代码?
1楼
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime as dt
import csv
def Soup(content):
    """Parse raw HTML (bytes or text) into a BeautifulSoup tree."""
    return BeautifulSoup(content, 'html.parser')
def Main(url):
    """Fetch the festival listing page and return absolute detail-page URLs.

    The original rebuilt absolute links with the hard-coded slice
    ``url[:27]`` (exactly the length of the scheme + host of this one
    listing URL), which silently produces garbage if the listing URL ever
    changes. ``urljoin`` resolves each href against the page URL correctly
    and yields the same result for the root-relative hrefs this site emits.
    """
    from urllib.parse import urljoin  # stdlib; local import keeps the fix self-contained
    r = requests.get(url)
    soup = Soup(r.content)
    spans = soup.findAll('span', class_='fw_bold')
    return [urljoin(url, span.a['href']) for span in spans]
def Parent():
    """Crawl every festival detail page and write the fields to result.csv."""
    links = Main(
        "http://ticket.interpark.com/TPGoodsList.asp?Ca=Liv&SubCa=Fes")
    with open("result.csv", 'w', newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Name", "Singers", "Location", "Date", "ImageUrl"])
        # One Session = one keep-alive connection reused for every page.
        with requests.Session() as req:
            for link in links:
                soup = Soup(req.get(link).content)
                # Each detail page embeds its metadata as a JSON-LD script.
                script = json.loads(
                    soup.find("script", type="application/ld+json").text)
                name = script["name"]
                print(f"Extracting: {name}")
                singers = script["performer"]["name"]
                location = script["location"]["name"]
                image = script["image"]
                # NOTE(review): positions 3:5 of the JSON object are assumed
                # to be the start/end dates in '%Y%m%d' form -- this relies on
                # key order in the site's JSON-LD; confirm against a live page.
                raw_dates = list(script.values())[3:5]
                formatted = [dt.strptime(d, '%Y%m%d').strftime('%d-%m-%Y')
                             for d in raw_dates]
                writer.writerow(
                    [name, singers, location, " : ".join(formatted), *image])
Parent()