机器学习（Hands on）第二章修正版完整代码_综合

前言

最近尝试学习机器学习有关知识，随着Python版本的更新，《Hands-On Machine Learning with Scikit-Learn & TensorFlow》书中部分代码并不适用，根据百度查到的一些改动做了总结（具体改动部分已经忘记，想要了解细节的读者可以去书中自己比较），下面是第二章完整代码：
代码

import numpy as np
import os
import pandas as pd
import tarfile

try:
    # The original listing relies on the six compatibility shim.
    from six.moves import urllib
except ImportError:
    # Fall back to the standard library when six is not installed; this
    # binds the same ``urllib.request`` attribute path used below.
    import urllib.request

# Base URL of the raw files in the book's GitHub repository.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Local directory that receives the downloaded archive and its contents.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing dataset.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into *housing_path*.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and the extracted
            files; it is created when it does not exist yet.
    """
    # Create the target directory if it is missing.
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    # Download the file at housing_url to tgz_path.
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive; only use this with a download source you trust.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
# Read the data with pandas.
def load_housing_data(housing_path=HOUSING_PATH):
    """Load ``housing.csv`` from *housing_path* into a DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``.

    Returns:
        The result of ``pandas.read_csv`` on that file.
    """
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)
# Load the dataset from the default HOUSING_PATH.
# NOTE(review): nothing in the visible script calls fetch_housing_data(), so
# housing.csv presumably has to exist locally already — confirm.
housing=load_housing_data()
import matplotlib.pyplot as plt
# Next: create a test set.
def split_train_test(data, test_ratio):
    """Randomly split *data* into a training part and a test part.

    The split uses ``np.random.permutation``, so it changes between runs
    unless the NumPy random seed is fixed by the caller.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing.
        test_ratio: Fraction of the rows that goes to the test part.

    Returns:
        A ``(train, test)`` tuple selected from *data* by position.
    """
    shuffled_indices = np.random.permutation(len(data))
    # int() truncates, so the test part never exceeds the requested ratio.
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
import hashlib#hash值的最后一个字节小于51的划入测试集
def test_set_check(identifier, test_ratio, hash):
    """Tell whether the instance with *identifier* belongs to the test set.

    Args:
        identifier: Value convertible with ``np.int64``.
        test_ratio: Fraction of identifiers that should land in the test set.
        hash: Hash constructor (e.g. ``hashlib.md5``) whose result exposes
            ``digest()``.

    Returns:
        True when the last digest byte is below ``256 * test_ratio``.
    """
    # The last digest byte lies in 0..255, so this threshold keeps roughly
    # test_ratio of all identifiers (e.g. bytes below 51 for a 0.2 ratio).
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio


# The helper's name starts with "test_"; mark it so that pytest does not try
# to collect it as a test case when this module is imported by a test run.
test_set_check.__test__ = False


def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    """Split *data* deterministically using a hash of an identifier column.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of identifiers assigned to the test set.
        id_column: Name of the column holding the identifiers.
        hash: Hash constructor forwarded to ``test_set_check``.

    Returns:
        A ``(train, test)`` tuple of row selections from *data*.
    """
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    return data.loc[~in_test_set], data.loc[in_test_set]
#利用行号创建标识符
#housing_with_id = housing.reset_index() # 加入 'index' 列
#train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
#新的方法产生唯一标识符
#housing_with_id['id'] = housing['longitude']*1000 + housing['latitude']
#train_set, test_set = split_train_test_by_id(housing_with_id, 0.2,'id')
# Scikit-Learn's built-in train_test_split() helper.
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

# Stratified sampling: derive an income category to stratify on.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap the category at 5. The result is assigned back explicitly because an
# inplace where() on a column selection does not reliably update the frame.
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0)

# Stratified split with Scikit-Learn's StratifiedShuffleSplit.
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the loop body runs exactly once.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

# Drop the helper column again so both sets have the original columns.
for data in (strat_train_set, strat_test_set):
    data.drop(['income_cat'], axis=1, inplace=True)
# 3. Visualise the data to discover patterns.
# Marker size encodes population; colour encodes median_house_value using the
# 'jet' colormap (blue for low values, red for high values).
housing.plot(kind='scatter', x='longitude', y='latitude', alpha=0.4,
             s=housing['population']/100, label='population',
             c='median_house_value', cmap=plt.get_cmap('jet'), colorbar=True)
plt.legend()

# Look for correlations.
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

# Attribute combinations.
housing["rooms_per_household"] = housing['total_rooms'] / housing['households']
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing['total_rooms']
housing["population_per_household"] = housing["population"] / housing["households"]

# Restrict the correlation to numeric columns: the frame also holds the
# non-numerical ocean_proximity attribute, which newer pandas versions refuse
# to correlate instead of silently skipping it.
corr_matrix = housing.select_dtypes(include=[np.number]).corr()
# Notebook-style display; has no visible effect when run as a script.
corr_matrix["median_house_value"].sort_values(ascending=False)
# Separate the predictors from the target so that later transformations are
# applied to the predictors only.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

# total_bedrooms has missing values. Options for handling them:
#   1. drop the rows with missing values
#   2. drop the whole total_bedrooms attribute
#   3. fill the gaps with some value (0, mean, median, ...)
# housing.dropna(subset=['total_bedrooms'])      # option 1
# housing.drop('total_bedrooms', axis=1)         # option 2
# median = housing['total_bedrooms'].median()
# housing['total_bedrooms'].fillna(median)       # option 3
# With option 3 the same median must also be used to fill the test set, so
# an imputer object is fitted once and reused.
try:
    from sklearn.impute import SimpleImputer  # Scikit-Learn 0.20+
except ImportError:
    from sklearn.preprocessing import Imputer as SimpleImputer

# Create an imputer instance that uses the median strategy.
imputer = SimpleImputer(strategy='median')
# Drop the non-numerical attribute; the median is only defined for numbers.
housing_num = housing.drop("ocean_proximity", axis=1)
# Fit the imputer to the training data.
imputer.fit(housing_num)
# Replace the missing values with the learned medians.
X = imputer.transform(housing_num)
# Convert the resulting array back into a DataFrame with the original
# column names and index.
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing.index)
# Handling text and categorical attributes.
# Use Scikit-Learn's LabelEncoder to turn the text labels into numbers.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
housing_cat_encoded = encoder.fit_transform(housing_cat)
print(housing_cat_encoded)
# One-hot encode the category codes with Scikit-Learn's OneHotEncoder.
# Note that `encoder` is rebound here to the new encoder object.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
# reshape(-1, 1) turns the 1-D code array into a single-column 2-D array.
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
# Notebook-style display; the author's recorded output follows.
housing_cat_1hot
#<16512x5 sparse matrix of type '<class 'numpy.float64'>'
# with 16512 stored elements in Compressed Sparse Row format>
# Convert the sparse matrix into a dense array.
housing_cat_1hot.toarray()
#自定义转换器
#Scikit-Learn的FunctionTransformer类可以基于转换函数构建转换器；下面的代码则是通过封装LabelBinarizer来实现自定义转换器
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
class MyLabelBinarizer(TransformerMixin):
    """Wrapper around ``LabelBinarizer`` whose methods accept an optional *y*.

    All constructor arguments are forwarded to ``LabelBinarizer``.
    """

    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on *x*; *y* is ignored. Returns self."""
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        """Return the wrapped encoder's transform of *x*; *y* is ignored."""
        return self.encoder.transform(x)
# Binarize the category column with the wrapper class; `encoder` and
# `housing_cat_1hot` are rebound to the new objects here.
encoder=MyLabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)
# Notebook-style display; has no visible effect when run as a script.
housing_cat_1hot
# Column positions read by CombinedAttributesAdder.transform().
# NOTE(review): presumably these match the column order of housing.csv —
# confirm against the dataset.
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio features to a 2-D array.

    Args:
        add_bedrooms_per_room: When true, ``bedrooms_per_room`` is appended
            in addition to the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return self; there is nothing to learn."""
        return self

    def transform(self, X, y=None):
        """Return *X* with the derived ratio columns appended on the right."""
        rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
        population_per_household = X[:, population_ix] / X[:, household_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]
# Append the two per-household ratios (bedrooms_per_room is switched off)
# to the raw values of the training predictors.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)
# Feature scaling and transformation pipelines.
# Scikit-Learn's Pipeline chains transformers so that they run in order.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])
housing_num_tr = num_pipeline.fit_transform(housing_num)

# Combine the numerical pipeline with the handling of the categorical column.
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    # NOTE(review): future_encoders is presumably a helper module shipped
    # alongside the book's code for older Scikit-Learn versions — confirm.
    from future_encoders import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs),
])
housing_prepared = full_pipeline.fit_transform(housing)
# Train and validate on the training set.
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

# Try the model out on a few training instances.
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
# Apply the already fitted preparation pipeline (transform, not fit).
some_data_prepared = full_pipeline.transform(some_data)
print('Predictions:\t\t', lin_reg.predict(some_data_prepared))
# Compute the RMSE with Scikit-Learn's mean_squared_error function.
from sklearn.metrics import mean_squared_error
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
# RMSE is the square root of the MSE.
lin_rmse = np.sqrt(lin_mse)
# Compute the MAE with Scikit-Learn's mean_absolute_error function.
from sklearn.metrics import mean_absolute_error
lin_mae=mean_absolute_error(housing_labels,housing_predictions)
# Fit the data with a decision tree.
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(housing_prepared, housing_labels)
# housing_predictions is rebound to the tree's predictions; both models are
# scored on the same data they were fitted on.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
# 6. Fine-tune the model.
# Compute cross-validation scores for the decision tree (10 folds).
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,scoring="neg_mean_squared_error", cv=10)
# The scoring is the negated MSE, so flip the sign before taking the root.
tree_rmse_scores = np.sqrt(-scores)
def display_scores(scores):
    """Print the scores followed by their mean and standard deviation.

    Args:
        scores: Object providing ``mean()`` and ``std()`` methods, such as
            a NumPy array.
    """
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
# Cross-validation scores for the linear regression model.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
# The scoring is the negated MSE, so flip the sign before taking the root.
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

# Predict with a random forest and compute its cross-validation scores.
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

# RMSE of the random forest on the data it was fitted on.
housing_predictions = forest_reg.predict(housing_prepared)
forest_mse = mean_squared_error(housing_labels, housing_predictions)
forest_rmse = np.sqrt(forest_mse)

forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring='neg_mean_squared_error', cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
# display_scores(forest_rmse_scores)

# Fit a support vector regressor with a linear kernel and compute its RMSE.
from sklearn.svm import SVR

svm_reg = SVR(kernel='linear')
svm_reg.fit(housing_prepared, housing_labels)
housing_predictions = svm_reg.predict(housing_prepared)
svm_mse = mean_squared_error(housing_labels, housing_predictions)
svm_rmse = np.sqrt(svm_mse)
# Use Scikit-Learn's GridSearchCV to help choose hyperparameters.
from sklearn.model_selection import GridSearchCV
# Two grids: the first varies n_estimators and max_features, the second
# additionally sets bootstrap to False.
param_grid = [{
    'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},{
    'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},]
# forest_reg is rebound to a fresh estimator (no random_state given here).
forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)
# Notebook-style display; has no visible effect when run as a script.
grid_search.best_params_
# grid_search.best_estimator_
# Look at the score of each hyperparameter combination tested during the
# grid search; scores are negated MSE, hence the sign flip before the root.
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'],cvres['params']):print(np.sqrt(-mean_score),params)
# Show the results as a DataFrame.
pd.DataFrame(grid_search.cv_results_)
# Use a randomized search to choose hyperparameters.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Distributions the search samples hyperparameter values from.
param_distribs = {
    'n_estimators': randint(low=1, high=200),
    'max_features': randint(low=1, high=8),
}

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg,
                                param_distributions=param_distribs,
                                n_iter=10, cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42)
rnd_search.fit(housing_prepared, housing_labels)
# Print the relative importance of each attribute for the predictions.
# The importances come from the grid search's best estimator (not from the
# randomized search).
feature_importances = grid_search.best_estimator_.feature_importances_
print(feature_importances)
# Names for the three columns appended by CombinedAttributesAdder.
extra_attribs = ['rooms_per_hhold','pop_per_hhold', 'bedrooms_per_room']
# Fetch the fitted one-hot encoder to read the category names it learned.
cat_encoder = full_pipeline.named_transformers_['cat']
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# Notebook-style display of (importance, name) pairs, largest first.
sorted(zip(feature_importances, attributes), reverse=True)
# Evaluate the system on the test set.
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop('median_house_value', axis=1)
y_test = strat_test_set['median_house_value'].copy()

# Only transform here: the preparation pipeline was fitted on the training
# data and is not refitted on the test set.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
print(final_rmse)
# Compute a 95% confidence interval for the test RMSE.
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
mean = squared_errors.mean()
m = len(squared_errors)

# Interval for the mean squared error from the t distribution; the square
# root converts its bounds to RMSE bounds. The bare expressions below are
# notebook-style displays and show nothing when run as a script.
np.sqrt(stats.t.interval(confidence, m - 1,
                         loc=np.mean(squared_errors),
                         scale=stats.sem(squared_errors)))

# The interval could also be computed manually like this.
tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)
tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)

# Alternatively, z-scores could be used rather than t-scores.
zscore = stats.norm.ppf((1 + confidence) / 2)
zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)
np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)
# A single pipeline that covers both data preparation and prediction.
full_pipeline_with_predictor = Pipeline([
    ('preparation', full_pipeline),
    ('linear', LinearRegression()),
])

# Fitting takes the raw (unprepared) predictors; preparation runs inside.
full_pipeline_with_predictor.fit(housing, housing_labels)
full_pipeline_with_predictor.predict(some_data)
# Save the model with joblib.
my_model = full_pipeline_with_predictor

try:
    # Current Scikit-Learn releases no longer provide sklearn.externals.joblib;
    # joblib is imported as a package of its own instead.
    import joblib
except ImportError:
    # Older environments that only have the copy bundled with Scikit-Learn.
    from sklearn.externals import joblib

joblib.dump(my_model, 'my_model.pkl')  # save model
my_model_loaded = joblib.load('my_model.pkl')  # load model