Tri-Training: Exploiting Unlabeled Data Using Three Classifiers
Original paper: https://ieeexplore.ieee.org/abstract/document/1512038
Overview
Tri-training is a new semi-supervised learning algorithm in the co-training [1] style. Co-training trains two classifiers on two different views, i.e. two independent attribute sets, and uses each classifier's predictions on unlabeled examples to enlarge the training set of the other. Tri-training requires neither sufficient and redundant views nor different supervised learning algorithms whose hypotheses partition the instance space into a set of equivalence classes. Instead, it generates three different classifiers from the original labeled example set and then refines them by labeling unlabeled data during the tri-training process: in each round, an unlabeled example is labeled for one classifier if, under certain error-rate conditions, the other two classifiers agree on its label. This repeats until none of the three classifiers changes any more. On test examples, tri-training uses majority voting over the three classifiers.
Algorithm flow
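The pseudocode figure from the paper is not reproduced here; the following is a condensed outline of the procedure, matching the implementation further below (h1, h2, h3 denote the three classifiers, L the labeled set, U the unlabeled set).

1. Train h1, h2, h3 on bootstrap samples of L; initialize each previous error rate e'_i = 0.5 and previous pseudo-label count l'_i = 0.
2. In each round, for classifier h_i, estimate e_i, the joint error rate of the other two classifiers h_j and h_k: among the labeled examples on which h_j and h_k agree, the fraction on which the agreed label is wrong.
3. If e_i < e'_i, build L_i from the unlabeled examples on which h_j and h_k agree, labeled with that agreed prediction.
4. Retrain h_i on L ∪ L_i only if e_i * |L_i| < e'_i * l'_i (subsampling L_i when it has grown too large), so that the expected amount of newly introduced label noise keeps shrinking; then set e'_i = e_i and l'_i = |L_i|.
5. Stop when no classifier is updated in a round; classify test examples by majority vote of h1, h2, h3.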
Python code
Data loading
#coding=gbk
# Read in the image data and save the flattened feature/label vectors (the code below pickles them rather than writing .mat files)
import torch
import pickle
from torch.utils.data import Dataset, DataLoader
import os
import numpy as np
from torchvision import datasets,transforms
import scipy.io as scio
data_dir = 'E:/data/test/test_input'
#label_sample = "E:/data/test/test_input"
#un_sample = "E:/data/test/test_input"
#test_sample = "E:/data/test/test_input"
#file_dir = {"label_sample":label_sample,"un_sample":un_sample,"test_sample":test_sample}

"""
# Hand-rolled dataset class, kept for reference but disabled; ImageFolder below is used instead.
# (Enabling it would also require: from PIL import Image)
class TensorDataset(Dataset):
    def __init__(self, root, data_transforms=None, target_transform=None):
        self.root = root
        self.data_transforms = data_transforms
        self.target_transform = target_transform
        self.classes = os.listdir(root)
        self.data = []
        self.label = []
        for index, label in enumerate(self.classes):
            file_list_img = os.listdir(self.root + '/' + label)
            for j in file_list_img:
                image = Image.open(self.root + '/' + label + '/' + j).convert('RGB')
                self.label.append(index)
                self.data.append(image)

    def __getitem__(self, index):
        img, target = self.data[index], self.label[index]
        if self.data_transforms is not None:
            img = self.data_transforms(img)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return img, target

    def __len__(self):
        return len(self.data)
"""

data_tf = transforms.Compose([
    #transforms.Resize((28,28)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize([0.415, 0.429, 0.444], [0.282, 0.272, 0.272])
])

def text_save(file_dir, data):
    # Serialize an array to disk with pickle
    with open(file_dir, 'wb') as f:
        pickle.dump(data, f)

image_folder = datasets.ImageFolder(data_dir, transform=data_tf)
dataloader = DataLoader(image_folder, batch_size=4, shuffle=True, num_workers=0)

# Flatten each image into a row vector and stack all batches
i = 0
for data, label in dataloader:
    #print(data)
    data = data.reshape(data.shape[0], -1)
    if i == 0:
        data_vector = data
        label_vector = label
    else:
        data_vector = np.vstack((data_vector, data))
        label_vector = np.hstack((label_vector, label))
    i += 1
#print(data_vector.shape)
#print(label_vector.shape)

text_save('data_vector.txt', data_vector)
text_save('label_vector.txt', label_vector)
#text_save('undata_vector.txt', data_vector)
#text_save('unlabel_vector.txt', label_vector)
#text_save('test_data_vector.txt', data_vector)
#text_save('testlabel_vector.txt', label_vector)
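The commented-out text_save calls above suggest the same script is pointed in turn at the unlabeled and test image folders to produce undata_vector.txt, unlabel_vector.txt, test_data_vector.txt and testlabel_vector.txt, which the main model below loads. A quick, optional check that a saved pair reads back correctly (file names as used above):

import pickle

with open('data_vector.txt', 'rb') as f:
    X = pickle.load(f)
with open('label_vector.txt', 'rb') as f:
    y = pickle.load(f)
print(X.shape, y.shape)  # expected: (n_samples, 3*H*W) and (n_samples,)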
Main model
#coding=gbk
import numpy as np
import sklearn
import sklearn.base
import sklearn.utils  # explicit submodule imports; sklearn.base.clone and sklearn.utils.resample are used below
import pickle
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.neural_network import MLPClassifier


class TriTraining:
    def __init__(self, classifier):
        # Accept either a single base learner (cloned three times) or a list of three learners
        if sklearn.base.is_classifier(classifier):
            self.classifiers = [sklearn.base.clone(classifier) for i in range(3)]
        else:
            self.classifiers = [sklearn.base.clone(classifier[i]) for i in range(3)]

    def fit(self, L_X, L_y, U_X):
        # Initialize each classifier on a bootstrap sample of the labeled data
        for i in range(3):
            sample = sklearn.utils.resample(L_X, L_y)
            self.classifiers[i].fit(*sample)
        e_prime = [0.5] * 3
        l_prime = [0] * 3
        e = [0] * 3
        update = [False] * 3
        Li_X, Li_y = [[]] * 3, [[]] * 3  # to save proxy-labeled data
        improve = True
        self.iter = 0
        while improve:
            self.iter += 1  # count iterations
            for i in range(3):
                j, k = np.delete(np.array([0, 1, 2]), i)
                update[i] = False
                e[i] = self.measure_error(L_X, L_y, j, k)
                if e[i] < e_prime[i]:
                    U_y_j = self.classifiers[j].predict(U_X)
                    U_y_k = self.classifiers[k].predict(U_X)
                    Li_X[i] = U_X[U_y_j == U_y_k]  # when the two models agree on the label, save it
                    Li_y[i] = U_y_j[U_y_j == U_y_k]
                    if l_prime[i] == 0:  # not updated before
                        l_prime[i] = int(e[i] / (e_prime[i] - e[i]) + 1)
                    if l_prime[i] < len(Li_y[i]):
                        if e[i] * len(Li_y[i]) < e_prime[i] * l_prime[i]:
                            update[i] = True
                        elif l_prime[i] > e[i] / (e_prime[i] - e[i]):
                            # Subsample the proxy-labeled set so the noise bound still holds
                            L_index = np.random.choice(len(Li_y[i]), int(e_prime[i] * l_prime[i] / e[i] - 1))
                            Li_X[i], Li_y[i] = Li_X[i][L_index], Li_y[i][L_index]
                            update[i] = True
            for i in range(3):
                if update[i]:
                    self.classifiers[i].fit(np.append(L_X, Li_X[i], axis=0), np.append(L_y, Li_y[i], axis=0))
                    e_prime[i] = e[i]
                    l_prime[i] = len(Li_y[i])
            if update == [False] * 3:
                improve = False  # if no classifier was updated, there is no improvement

    def predict(self, X):
        # Majority vote: where classifiers 1 and 2 agree, their label overrides classifier 0
        pred = np.asarray([self.classifiers[i].predict(X) for i in range(3)])
        pred[0][pred[1] == pred[2]] = pred[1][pred[1] == pred[2]]
        return pred[0]

    def score(self, X, y):
        return sklearn.metrics.accuracy_score(y, self.predict(X))

    def measure_error(self, X, y, j, k):
        # Joint error of classifiers j and k: among labeled examples where they agree,
        # the fraction on which the agreed label is wrong
        j_pred = self.classifiers[j].predict(X)
        k_pred = self.classifiers[k].predict(X)
        wrong_index = np.logical_and(j_pred != y, k_pred == j_pred)
        #wrong_index = np.logical_and(j_pred != y_test, k_pred != y_test)
        return sum(wrong_index) / sum(j_pred == k_pred)


def load_data(file_dir):
    f = open(file_dir, 'rb')
    data = pickle.load(f)
    f.close()
    return data


traindata = load_data('data_vector.txt')
trainlabel = load_data('label_vector.txt')
testdata = load_data('test_data_vector.txt')
testlabel = load_data('testlabel_vector.txt')
udata = load_data('undata_vector.txt')
ulabel = load_data('unlabel_vector.txt')

"""
# Random forest (left disabled; would also require: from sklearn.ensemble import RandomForestClassifier)
clf = RandomForestClassifier()
clf.fit(traindata,trainlabel)
res1 = clf.predict(testdata)
#print(accuracy_score(res1,testlabel))
"""#高斯朴素贝叶斯
gnb = GaussianNB()
gnb.fit(traindata,trainlabel)
y_pred = gnb.predict(testdata)
print(accuracy_score(y_pred, testlabel))

# Decision tree
dt = tree.DecisionTreeClassifier()
dt.fit(traindata,trainlabel)
y_pred = dt.predict(testdata)
print(accuracy_score(y_pred, testlabel))

# BP neural network (MLP)
bpnn = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
bpnn.fit(traindata,trainlabel)
y_pred = bpnn.predict(testdata)
print(accuracy_score(y_pred, testlabel))

# Tri-training with the three base learners above
TT = TriTraining([GaussianNB(),
                  tree.DecisionTreeClassifier(),
                  MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)])
TT.fit(traindata,trainlabel,udata)
res2 = TT.predict(testdata)
print(accuracy_score(res2,testlabel))
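If the image pickles are not available, the TriTraining class can be sanity-checked on scikit-learn's built-in digits dataset. This is only an illustrative sketch: the 20% test split, the 20%/80% labeled/unlabeled split and the names X_lab, X_unlab, demo_tt are assumptions made here, not part of the original script.

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

digits = load_digits()
# Hold out a test set, then keep only a small labeled portion of the remainder
X_rest, X_test, y_rest, y_test = train_test_split(
    digits.data, digits.target, test_size=0.2, random_state=0)
X_lab, X_unlab, y_lab, _ = train_test_split(
    X_rest, y_rest, test_size=0.8, random_state=0)

demo_tt = TriTraining([GaussianNB(),
                       tree.DecisionTreeClassifier(),
                       MLPClassifier(solver='lbfgs', alpha=1e-5,
                                     hidden_layer_sizes=(5, 2), random_state=1)])
demo_tt.fit(X_lab, y_lab, X_unlab)    # the unlabeled pool's true labels are discarded
print(demo_tt.score(X_test, y_test))  # accuracy of the combined vote on the held-out test set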
References
[1] A. Blum and T. Mitchell, “Combining labeled and unlabeled data with co-training,” in Proceedings of the 11th Annual Conference on Computational Learning Theory, Madison, WI, pp.92–100, 1998.