下面代码为: .ipynb文件, 可以在Jupyter上运行
题目为:
给了3个.csv文件(一个训练, 一个测试,一个预测)
求: 预测乘客生还是死
import numpy as np#处理矩阵
import pandas as pd#资料处理(读csv文件)
import matplotlib.pyplot as plt#画图
import seaborn as snsimport warnings
warnings.filterwarnings('ignore')#读入文件
%matplotlib inline
train= pd.read_csv('/Users/liyixin/Desktop/AI/third-titanic/train.csv')#训练
test2= pd.read_csv('/Users/liyixin/Desktop/AI/third-titanic/test_ti.csv')#测试
test= pd.read_csv('/Users/liyixin/Desktop/AI/third-titanic/test.csv')#预测#数据处理(训练,测试,预测)
# Drop attributes judged irrelevant to survival (Name, Ticket, Cabin);
# the remaining columns keep their relative positions.
# (The original joined two statements onto one line — a syntax error.)
train = train.drop(["Name", "Ticket", "Cabin"], axis=1)
test = test.drop(["Name", "Ticket", "Cabin"], axis=1)
test2 = test2.drop(["Name", "Ticket", "Cabin"], axis=1)

# Encode Sex as an integer: male -> 1, female -> 0 (same mapping on all three frames).
for _df in (train, test, test2):
    _df.loc[_df["Sex"] == "male", "Sex"] = 1
    _df.loc[_df["Sex"] == "female", "Sex"] = 0
# Fill missing Embarked with "S" (the mode of the column), then encode
# the port of embarkation as an integer: S -> 0, C -> 1, Q -> 2.
# (The original joined two statements onto one line — a syntax error.)
train["Embarked"] = train["Embarked"].fillna("S")
test2["Embarked"] = test2["Embarked"].fillna("S")

for _df in (train, test, test2):
    _df.loc[_df["Embarked"] == "S", "Embarked"] = 0
    _df.loc[_df["Embarked"] == "C", "Embarked"] = 1
    _df.loc[_df["Embarked"] == "Q", "Embarked"] = 2
# Missing Fare in the prediction set: fill with the median fare.
# BUG FIX: the original did  test["Fare"] = test['Fare'].fillna(..., inplace=True)
# — with inplace=True fillna returns None, so the whole column was wiped.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Age imputation: for each frame, draw random integer ages uniformly from
# [mean - std, mean + std) — one draw per missing value.
average_age_train = train["Age"].mean()            # mean age
std_age_train = train["Age"].std()                 # standard deviation (the original comment mislabelled this "median")
count_nan_age_train = train["Age"].isnull().sum()  # number of missing ages

average_age_test = test["Age"].mean()
std_age_test = test["Age"].std()
count_nan_age_test = test["Age"].isnull().sum()

average_age_test2 = test2["Age"].mean()
std_age_test2 = test2["Age"].std()
count_nan_age_test2 = test2["Age"].isnull().sum()

rand_1 = np.random.randint(average_age_train - std_age_train,
                           average_age_train + std_age_train,
                           size=count_nan_age_train)
rand_2 = np.random.randint(average_age_test - std_age_test,
                           average_age_test + std_age_test,
                           size=count_nan_age_test)
rand_3 = np.random.randint(average_age_test2 - std_age_test2,
                           average_age_test2 + std_age_test2,
                           size=count_nan_age_test2)

# Write the random ages into the NaN slots of each frame.
train.loc[np.isnan(train["Age"]), "Age"] = rand_1
test.loc[np.isnan(test["Age"]), "Age"] = rand_2
test2.loc[np.isnan(test2["Age"]), "Age"] = rand_3

# Age is now fully populated — cast to int.
train['Age'] = train['Age'].astype(int)
test['Age'] = test['Age'].astype(int)
test2['Age'] = test2['Age'].astype(int)
from sklearn import preprocessing  # min-max scaling to shrink feature ranges
from keras import utils as np_utils

# Separate features from labels (drop the id column; Survived is the target).
X_train = train.drop(["PassengerId", "Survived"], axis=1)
y_train = train["Survived"]
X_test2 = test2.drop(["PassengerId", "Survived"], axis=1)
y_test2 = test2["Survived"]
X_test = test.drop(["PassengerId"], axis=1)

# Convert to numpy arrays with explicit dtypes.
X_train = np.array(X_train).astype('float32')
y_train = np.array(y_train).astype('int32')
X_test2 = np.array(X_test2).astype('float32')
y_test2 = np.array(y_test2).astype('int32')
X_test = np.array(X_test).astype('float32')

# Min-max scale every feature into [0, 1].
# BUG FIX: fit the scaler on the training data ONLY and reuse it for the
# hold-out set — the original called fit_transform on X_test2 as well,
# leaking test statistics into the preprocessing.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_train_norm = minmax_scale.fit_transform(X_train)
X_test2_norm = minmax_scale.transform(X_test2)

# One-hot encode the binary labels: 0 -> [1, 0], 1 -> [0, 1].
# (to_categorical is a keras utility — the original comment claimed pandas.)
y_train_onehot = np_utils.to_categorical(y_train, 2)
y_test2_onehot = np_utils.to_categorical(y_test2, 2)
from keras.layers.normalization import BatchNormalization  # imported; not used below
from keras.models import Sequential     # linear stack of layers
from keras.utils import np_utils
from keras.layers import Dense, Dropout  # Dense = fully-connected layer; Dropout fights overfitting

# Fully-connected classifier: 7 input features -> 512 -> 128 -> 32 -> 2.
# kernel_initializer chooses the weight-initialisation scheme; hidden layers
# use sigmoid activations, the 2-unit output uses softmax (one-hot target).
# Dropout(0.25) after each hidden layer to reduce overfitting.
model = Sequential([
    Dense(units=512, input_dim=7,
          kernel_initializer='random_uniform', activation='sigmoid'),
    Dropout(0.25),
    Dense(units=128, kernel_initializer='random_uniform', activation='sigmoid'),
    Dropout(0.25),
    Dense(units=32, kernel_initializer='random_uniform', activation='sigmoid'),
    Dropout(0.25),
    Dense(units=2, kernel_initializer='random_uniform', activation='softmax'),
])
model.summary()  # print a layer-by-layer summary
from keras import optimizers  # optimizers module (rmsprop chosen below)
import keras
from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping

# binary_crossentropy for the 2-class one-hot target; rmsprop clearly beat
# sgd in earlier runs (reported: sgd ~60% -> rmsprop ~80%).
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Checkpoint the best model (lowest val_loss) and stop as soon as val_loss
# stops improving (patience=0).
checkpointer = ModelCheckpoint(filepath='C:\\input\\NNModel.h5', verbose=1, save_best_only=True)
early = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto', baseline=None)
call_list = [checkpointer, early]

# Train with 10% of the training data held out for validation.
# BUG FIX: in the original, an inline comment swallowed  batch_size=128, verbose=1)
# leaving the call unterminated (a syntax error), and call_list was built but
# never passed to fit — the callbacks were dead code.
train_history = model.fit(x=X_train_norm, y=y_train_onehot,
                          validation_split=0.1,
                          epochs=100,       # upper bound; early stopping finds the low point
                          batch_size=128,
                          verbose=1,
                          callbacks=call_list)
# Plot training curves: the train metric vs. its validation counterpart.
import matplotlib.pyplot as plt


def show_train_history(train_history, train, validation):
    """Plot one training metric against its validation counterpart.

    train_history: History object returned by model.fit.
    train / validation: key names in train_history.history,
    e.g. 'acc'/'val_acc' or 'loss'/'val_loss'.
    """
    # (The original collapsed this whole function onto one line,
    # which is a syntax error in Python.)
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


show_train_history(train_history, 'acc', 'val_acc')
show_train_history(train_history, 'loss', 'val_loss')
# Evaluate on the labelled hold-out set (~100 rows; accuracy ~0.81 reported).
# BUG FIX: in the original, an inline comment swallowed  y=y_test2_onehot)
# leaving the call unterminated (a syntax error).
scores = model.evaluate(x=X_test2_norm, y=y_test2_onehot)
scores[1]  # accuracy — displayed as the notebook cell output

# Predicted classes on the hold-out set, for the confusion matrix.
prediction = model.predict_classes(X_test2_norm)
import pandas as pd
print("\t[Info] Display Confusion Matrix:")
print("%s\n" % pd.crosstab(y_test2, prediction, rownames=['label'], colnames=['predict']))

# Predict the unlabelled set. In the output, 0 = died, 1 = survived.
# BUG FIX: scale X_test with the already-fitted scaler first — the original
# predicted on raw features while the network was trained on scaled ones.
Y_pred = model.predict_classes(minmax_scale.transform(X_test))
s = {"PassengerId": test["PassengerId"], "Survived": Y_pred}
submit = pd.DataFrame(data=s)
submit.to_csv('/Users/liyixin/Desktop/AI/third-titanic/titanic.csv', index=False)