《青春有你2》参赛选手数据分析
本文章主要介绍《青春有你2》的数据分析流程。
任务描述:使用python爬取《青春有你2》所有参赛选手的信息,然后进行数据可视化分析
实践平台:windows
实践环境:Python 3 + requests + beautifulsoup4 + pandas + matplotlib
1. 使用 Python 爬取《青春有你2》所有参赛选手信息:
选手数据来源百度百科:https://baike.baidu.com/item/青春有你第二季/23802025?fromtitle=青春有你2&fromid=24266334
# pip install requests beautifulsoup4
# Fetch the contestant table from Baidu Baike and save it as data.json.
import json
import os

import requests
from bs4 import BeautifulSoup

url = "https://baike.baidu.com/item/青春有你第二季/23802025?fromtitle=青春有你2&fromid=24266334"

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
}

# SECURITY: a logged-in session cookie must never be hard-coded in source.
# If the request needs one, supply it via the BAIDU_COOKIE environment
# variable; when the variable is unset no Cookie header is sent.
cookie = os.environ.get("BAIDU_COOKIE")
if cookie:
    headers['Cookie'] = cookie

# requests has no default timeout, so set one explicitly, and fail loudly on
# an HTTP error instead of parsing an error page.
html = requests.get(url, headers=headers, timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, "html.parser")

# The contestant table is taken as the third-from-last <table> on the page.
# NOTE(review): this index depends on the page layout - verify if the page changes.
xuanshou_table = soup.find_all("table")[-3]

# One list of cell texts per table row, with newlines stripped from each cell.
data = []
for tr in xuanshou_table.find_all("tr"):
    data.append([td.get_text().replace("\n", "") for td in tr.find_all("td")])

# Save to JSON; ensure_ascii=False keeps the Chinese text readable in the file.
with open("data.json", "w", encoding='utf-8') as file:
    json.dump(data, file, indent=2, ensure_ascii=False)
2. 将 json 数据放入 pandas.DataFrame
# Read the saved table back from data.json and load it into a DataFrame.
import pandas as pd
import numpy as np

with open("data.json", 'r', encoding='UTF-8') as file:
    json_array = json.load(file)

# The first row supplies the column names; the remaining rows are the records.
df = pd.DataFrame(json_array[1:], columns=json_array[0])
df
拼装数据
# Per country/region: count the non-null 姓名 (name) entries and order the
# counts from largest to smallest.
# Presumably sample values: Shandong, Sichuan, Taiwan, Beijing (China).
zone_cnt_dict = (
    df.groupby(['国家/地区'])
    .count()
    .sort_values(by=['姓名'], ascending=False)
    ['姓名']
)
zone_cnt_dict

# The same aggregation for the other columns is left disabled here.
# Constellation (presumably sample values: Leo, Capricorn, Aries)
#xinzuo_cnt_dict = df.groupby(['星座']).count().sort_values(by='姓名', ascending=False)['姓名']
#xinzuo_cnt_dict
# Height (presumably sample values: 168, 170, 163)
#shengan_cnt_dict = df.groupby(['身高']).count().sort_values(by='姓名', ascending=False)['姓名']
#shengan_cnt_dict
# Weight (presumably sample values: 45-50)
#weight_cnt_dict = df.groupby(['体重']).count().sort_values(by='姓名', ascending=False)['姓名']
#weight_cnt_dict
# Bar chart of the per-region contestant counts held in zone_cnt_dict.
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# msyhbd.ttf is the Microsoft YaHei font; it is passed to every text element
# so that the Chinese labels do not render as garbled characters.
myfont = FontProperties(fname=r"msyhbd.ttf", size=12)

# Bar positions are the index labels of the series, bar heights its counts.
x = zone_cnt_dict.keys().tolist()
y = zone_cnt_dict.values.tolist()

plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the tick labels once the bars (and so the category ticks) exist.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
plt.title("《青春有你2》参赛选手区域排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("城市", fontproperties=myfont)
plt.show()
# Per constellation: count the non-null 姓名 (name) entries, largest first.
# Presumably sample values: Leo, Capricorn, Aries.
xinzuo_cnt_dict = df.groupby(['星座']).count().sort_values(by='姓名', ascending=False)['姓名']
xinzuo_cnt_dict

# Bar chart of the per-constellation counts.
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Font file used for every text element so the Chinese labels render.
myfont = FontProperties(fname=r"msyhbd.ttf", size=12)

# Bar positions are the index labels of the series, bar heights its counts.
x = xinzuo_cnt_dict.keys().tolist()
y = xinzuo_cnt_dict.values.tolist()

plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the tick labels once the bars (and so the category ticks) exist.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
plt.title("《青春有你2》参赛选手数星座排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("星座", fontproperties=myfont)
plt.show()
# Per height value: count the non-null 姓名 (name) entries, largest first.
# Presumably sample values: 168, 170, 163.
shengan_cnt_dict = df.groupby(['身高']).count().sort_values(by='姓名', ascending=False)['姓名']

# Bar chart of the per-height counts.
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Font file used for every text element so the Chinese labels render.
myfont = FontProperties(fname=r"msyhbd.ttf", size=12)

# Bar positions are the index labels of the series, bar heights its counts.
x = shengan_cnt_dict.keys().tolist()
y = shengan_cnt_dict.values.tolist()

plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the tick labels once the bars (and so the category ticks) exist.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
plt.title("《青春有你2》参赛选手数身高排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("身高", fontproperties=myfont)
plt.show()
# Per weight value: count the non-null 姓名 (name) entries, largest first.
# Presumably sample values: 45-50.
weight_cnt_dict = df.groupby(['体重']).count().sort_values(by='姓名', ascending=False)['姓名']

# Bar chart of the per-weight counts.
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Font file used for every text element so the Chinese labels render.
myfont = FontProperties(fname=r"msyhbd.ttf", size=12)

# Bar positions are the index labels of the series, bar heights its counts.
x = weight_cnt_dict.keys().tolist()
y = weight_cnt_dict.values.tolist()

plt.figure(figsize=(20, 15))
plt.bar(x, y, align='center')
# Style the tick labels once the bars (and so the category ticks) exist.
plt.xticks(rotation=45, fontsize=20, fontproperties=myfont)
plt.title("《青春有你2》参赛选手数体重排名", fontproperties=myfont)
plt.ylabel("人数", fontproperties=myfont)
plt.xlabel("体重", fontproperties=myfont)
plt.show()