分类代码
class AdaBoost:
    """AdaBoost binary classifier built from single-feature threshold stumps.

    Each weak classifier compares one feature against a threshold and
    outputs +1 or -1.  Labels are expected to be -1 or +1, because the stump
    outputs and the sample-weight update both rely on that encoding.

    Args:
        n_estimators: Maximum number of weak classifiers to train.
        learning_rate: Step used to enumerate candidate thresholds between
            the minimum and maximum of a feature.  Despite its name it is
            not a shrinkage factor.  Must be positive.

    Raises:
        ValueError: If ``learning_rate`` is not positive.
    """

    # Weighted errors are clipped to [_EPS, 1 - _EPS] before alpha is
    # computed so that a perfect stump yields a large finite coefficient
    # instead of a division by zero.
    _EPS = 1e-10

    def __init__(self, n_estimators=50, learning_rate=1.0):
        if learning_rate <= 0:
            raise ValueError("learning_rate must be positive")
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Store the training data and reset all fitted state.

        Args:
            datasets: 2-D array-like of shape (n_samples, n_features).
            labels: 1-D array-like of -1 / +1 labels, one per sample.
        """
        self.X = np.asarray(datasets)
        self.Y = np.asarray(labels)
        self.M, self.N = self.X.shape
        # Fitted weak classifiers as (feature index, threshold, direction).
        self.clf_sets = []
        # Every sample starts with the same weight.
        self.weights = np.full(self.M, 1.0 / self.M)
        # Coefficient (vote weight) of each weak classifier.
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold stump for a single feature column.

        Candidate thresholds are ``min + learning_rate * i`` for
        ``i = 1, 2, ...`` up to the feature maximum; candidates that coincide
        with an observed feature value are skipped.

        Returns:
            Tuple ``(best_v, direct, error, compare_array)``: the chosen
            threshold, its direction (``'positive'`` means "predict +1 when
            x > v"), its weighted error and its per-sample predictions.
            When no candidate threshold exists, ``direct`` and
            ``compare_array`` are ``None`` and ``error`` is infinite.
        """
        features = np.asarray(features)
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=float)

        error = float("inf")
        best_v = 0.0
        direct, compare_array = None, None

        step = self.learning_rate
        features_min = features.min()
        features_max = features.max()
        n_step = (features_max - features_min + step) // step
        # Built once so the membership test inside the loop is O(1).
        observed = set(features.tolist())

        for i in range(1, int(n_step)):
            v = features_min + step * i
            if v in observed:
                continue

            positive = np.where(features > v, 1, -1)
            negative = -positive
            error_positive = weights[positive != labels].sum()
            error_negative = weights[negative != labels].sum()

            if error_positive < error_negative:
                weight_error = error_positive
                candidate = positive
                candidate_direct = 'positive'
            else:
                weight_error = error_negative
                candidate = negative
                # Spelling kept as in the original so stored classifiers
                # remain compatible; G() treats any non-'positive' value
                # as the negative direction.
                candidate_direct = 'negetive'

            if weight_error < error:
                error = weight_error
                compare_array = candidate
                best_v = v
                direct = candidate_direct

        return best_v, direct, error, compare_array

    def _alpha(self, error):
        """Return the classifier coefficient 0.5 * ln((1 - e) / e).

        The error is clipped away from 0 and 1 first so the result is finite.
        """
        error = min(max(error, self._EPS), 1.0 - self._EPS)
        return 0.5 * np.log((1.0 - error) / error)

    def _Z(self, weights, a, clf):
        """Return the normalisation factor for the sample-weight update."""
        weights = np.asarray(weights, dtype=float)
        return np.sum(weights * np.exp(-a * self.Y * np.asarray(clf)))

    def _w(self, a, clf, Z):
        """Update ``self.weights`` from the latest classifier's outputs."""
        current = np.asarray(self.weights, dtype=float)
        self.weights = current * np.exp(-a * self.Y * np.asarray(clf)) / Z

    def G(self, x, v, direct):
        """Evaluate one stump on a scalar feature value ``x``."""
        if direct == 'positive':
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Train up to ``n_estimators`` stumps on ``X`` / ``y``.

        Training stops early when a stump classifies every sample correctly
        (no further re-weighting is possible) or when no feature offers a
        candidate threshold.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of -1 / +1 labels.
        """
        self.init_args(X, y)

        for _ in range(self.clf_num):
            best_clf_error = float("inf")
            best_v = final_direct = clf_result = axis = None

            # Pick the feature whose best stump has the lowest weighted error.
            for j in range(self.N):
                v, direct, error, compare_array = self._G(
                    self.X[:, j], self.Y, self.weights)
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j
                if best_clf_error == 0:
                    break

            if clf_result is None:
                # No usable threshold on any feature: nothing more to learn.
                break

            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            self.clf_sets.append((axis, best_v, final_direct))

            if best_clf_error == 0:
                # A perfect stump leaves every sample weight unchanged after
                # normalisation, so later rounds would only repeat it.
                break

            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Predict the label (+1 or -1) of a single sample.

        Args:
            feature: Indexable feature vector of one sample.

        Returns:
            1 if the weighted vote of all stumps is positive, otherwise -1.
        """
        result = 0.0
        for a, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets):
            result += a * self.G(feature[axis], clf_v, direct)
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly.

        ``X_test`` must be non-empty.
        """
        right_count = sum(
            1 for feature, label in zip(X_test, y_test)
            if self.predict(feature) == label
        )
        return right_count / len(X_test)
运行例子:
获取数据的方法
def create_data():
    """Build a two-feature, two-class dataset from the iris data.

    Takes the first 100 rows, keeps the first two feature columns and the
    label column, and rewrites label 0 as -1.

    Returns:
        Tuple ``(features, labels)``: the two feature columns and the label
        column of the selected rows.
    """
    iris = load_iris()
    frame = pd.DataFrame(iris.data, columns=iris.feature_names)
    frame['label'] = iris.target
    frame.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']

    # First column, second column and the label column of the first 100 rows.
    samples = np.array(frame.iloc[:100, [0, 1, -1]])

    # Rewrite label 0 as -1.
    zero_label = samples[:, -1] == 0
    samples[zero_label, -1] = -1

    return samples[:, :2], samples[:, -1]
运行
# Load the data and hold out 33% of the samples for testing (test_size=0.33).
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Train with 10 weak classifiers and a threshold step of 0.2.
clf = AdaBoost(n_estimators=10, learning_rate=0.2)
clf.fit(X_train, y_train)

# Accuracy on the held-out samples (the value is not stored).
clf.score(X_test, y_test)
当有100个分类器时（重复训练并评估100次，取平均准确率）
# Repeat the train/evaluate cycle on fresh splits and report the mean test
# accuracy as a percentage.
n_runs = 100

# The dataset is the same for every run, so load it once.
X, y = create_data()

result = []
for _ in range(n_runs):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    clf = AdaBoost(n_estimators=100, learning_rate=0.2)
    clf.fit(X_train, y_train)
    r = clf.score(X_test, y_test)
    result.append(r)

# Scores are fractions in [0, 1]; scale the mean to a percentage explicitly
# rather than relying on the number of runs being exactly 100.
print('average score:{:.3f}%'.format(100 * sum(result) / len(result)))
运行例子