数据来源:https://www.kaggle.com/damianpanek/sunday-eda/data
数据列名
1.电影时长分布
import pandas as pd
from matplotlib import pyplot as plt

# Script: plot the distribution of movie runtimes as a histogram.

file_path = './IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)

# Runtime values taken from the 'Runtime (Minutes)' column.
time_data = df['Runtime (Minutes)'].values
max_time = int(time_data.max())
min_time = int(time_data.min())

# Build explicit bin edges every `bin_width` minutes, starting at the shortest
# runtime.  Passing a bin *count* of (max - min) // 5 makes matplotlib split
# the range into equal bins that are only 5 wide when the range is an exact
# multiple of 5, so the bars drift away from the 5-minute ticks.  Using the
# same edges for bins and ticks keeps them aligned.
bin_width = 5
# max(..., min_time + 1) guarantees at least two edges when all runtimes are equal.
bin_edges = list(range(min_time, max(max_time, min_time + 1) + bin_width, bin_width))
group_time = len(bin_edges) - 1  # number of bins actually drawn

# Visualization
plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the CJK axis labels
plt.figure(figsize=(15, 8))
plt.hist(time_data, bins=bin_edges)
plt.xticks(bin_edges)
plt.xlabel('电影时长')
plt.ylabel('电影数量')
plt.show()
2.电影评分分布
注意:range 的步长不能为浮点数,因此评分的 x 轴刻度需要另行生成。
import math

import pandas as pd
from matplotlib import pyplot as plt

# Script: plot the distribution of movie ratings as a histogram with
# 0.5-wide bins.

file_path = './IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)

rating_data = df['Rating'].values
max_rating = rating_data.max()
min_rating = rating_data.min()

# range() cannot step by a float, so the edges are computed as
# min + k * bin_width.  Multiplying (instead of repeatedly adding 0.5) avoids
# accumulating floating-point error, and computing the count up front avoids
# emitting an extra tick past the last bin.
bin_width = 0.5
group = max(1, math.ceil((max_rating - min_rating) / bin_width))
_x = [min_rating + k * bin_width for k in range(group + 1)]
if _x[-1] < max_rating:
    # Float rounding left the top rating outside the last bin; add one more edge.
    _x.append(_x[-1] + bin_width)
    group += 1

plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the CJK axis labels
plt.figure(figsize=(15, 8))
# The same edges drive both the bins and the ticks so the bars line up with
# the tick marks.
plt.hist(rating_data, bins=_x)
plt.xticks(_x)
plt.xlabel('评分')
plt.ylabel('电影数量')
plt.show()
3.电影体裁
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Script: count how many movies carry each genre label and draw a bar chart.

file_path = './IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)

# The 'Genre' column holds comma-separated labels.  get_dummies(sep=',')
# builds one 0/1 indicator column per distinct label in a single vectorized
# call, replacing the manual zero matrix plus row-by-row .loc assignment.
# Columns come back in sorted order, so the bar order is the same on every run.
genre_flags = df['Genre'].str.get_dummies(sep=',')
genre_list = list(genre_flags.columns)

# Number of movies per genre
genre_count = genre_flags.sum()
_x = genre_count.index
_y = genre_count.values

plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render the CJK axis labels
plt.figure(figsize=(15, 8))
rects = plt.bar(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x)
plt.xlabel('电影类型')
plt.ylabel('电影数量')

# Annotate each bar with its count, centered slightly above the bar top.
# The counts are integers, so labels render as "303" rather than "303.0".
for rect, count in zip(rects, _y):
    plt.text(rect.get_x() + rect.get_width() / 2, count + 2, str(count), ha='center')
plt.show()