Datawhale17期-task3_综合

原始链接

数据特征工程

学习特征预处理、缺失值、异常值处理、数据分桶等特征处理方法
学习特征交互、编码、选择的相应方法

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
# from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

data_train =pd.read_csv('dataset/train.csv')
data_test_a = pd.read_csv('dataset/testA.csv')

确实值、异常值、对象类型值处理

numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

缺失值填充

把所有缺失值替换为指定的值0

data_train = data_train.fillna(0)
向用缺失值上面的值替换缺失值

data_train = data_train.fillna(axis=0,method=‘ffill’)
纵向用缺失值下面的值替换缺失值,且设置最多只填充两个连续的缺失值

data_train = data_train.fillna(axis=0,method=‘bfill’,limit=2)

#缺失值查看
data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n2.1                  40270
n4                    33239
n5                    40270
n6                    40270
n7                    40270
n8                    40271
n9                    40270
n10                   33239
n11                   69752
n12                   40270
n13                   40270
n14                   40270
dtype: int64

#用平均值填充数值型
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_test_a[numerical_fea].median())#用众数填充类别型数据
data_train[category_fea] = data_train[category_fea].fillna(data_train[category_fea].mode())
data_test_a[category_fea] = data_test_a[category_fea].fillna(data_test_a[category_fea].mode())

data_train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           0
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  0
regionCode                0
dti                       0
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies        0
revolBal                  0
revolUtil                 0
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     0
policyCode                0
n0                        0
n1                        0
n2                        0
n2.1                      0
n4                        0
n5                        0
n6                        0
n7                        0
n8                        0
n9                        0
n10                       0
n11                       0
n12                       0
n13                       0
n14                       0
dtype: int64

#类别特征
category_fea

['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']

时间格式特别处理

转换成距离以某一时间点（最小时间）的差值（天数）

for data in [data_train, data_test_a]:data["issueDate"] = pd.to_datetime(data["issueDate"], format = "%Y-%m-%d")startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')#构造时间特征data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

data_train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        52489
10+ years    262753
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64

#对象类型转换的数值
def employmentLength_to_int(s):if pd.isnull(s):return selse:return np.int8(s.split()[0])
for data in [data_train, data_test_a]:data["employmentLength"].replace(to_replace = "10+ years", value="10 years", inplace  = True)data["employmentLength"].replace("< 1 year", "0 years", inplace = True)data["employmentLength"] = data["employmentLength"].apply(employmentLength_to_int)

data["employmentLength"].value_counts(dropna = False).sort_index()

 0.0     159891.0     131822.0     182073.0     160114.0     118335.0     125436.0      93287.0      88238.0      89769.0      759410.0    65772
NaN      11742
Name: employmentLength, dtype: int64

#对earliesCreditLine进行预处理
data_train['earliesCreditLine'].sample(5)  #随机抽样5个

424703    Dec-1993
110607    May-2008
439311    Nov-2009
489048    Sep-2007
219141    Jul-1985
Name: earliesCreditLine, dtype: object

for data in [data_train, data_test_a]:data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s:int(s[-4:]))

类别特征处理

# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 79282
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 889
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 12058
policyCode 类型数： 1

像等级这种类别特征，是有优先级的可以labelencode或者自映射
get_dummies获取哑编码

for data in [data_train, data_test_a]:data['grade'] = data['grade'].map({
    'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})

# 类型数在2之上，又不是高维稀疏的,且纯分类特征 
for data in [data_train, data_test_a]:data = pd.get_dummies(data, columns=['subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

异常值处理

均方差法

def find_outliers_by_3segama(data,fea):data_std = np.std(data[fea])data_mean = np.mean(data[fea])outliers_cut_off = data_std * 3lower_rule = data_mean - outliers_cut_offupper_rule = data_mean + outliers_cut_offdata[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')return data

#进一步分析变量异常值和目标变量的关系
data_train = data_train.copy()
for fea in numerical_fea:data_train = find_outliers_by_3segama(data_train,fea)print(data_train[fea+'_outliers'].value_counts())print(data_train.groupby(fea+'_outliers')['isDefault'].sum())print('*'*10)

正常值    800000
Name: id_outliers, dtype: int64
id_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: loanAmnt_outliers, dtype: int64
loanAmnt_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    800000
Name: term_outliers, dtype: int64
term_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    794259
异常值      5741
Name: interestRate_outliers, dtype: int64
interestRate_outliers
异常值      2916
正常值    156694
Name: isDefault, dtype: int64
**********
正常值    792046
异常值      7954
Name: installment_outliers, dtype: int64
installment_outliers
异常值      2152
正常值    157458
Name: isDefault, dtype: int64
**********
正常值    800000
Name: employmentTitle_outliers, dtype: int64
employmentTitle_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    799701
异常值       299
Name: homeOwnership_outliers, dtype: int64
homeOwnership_outliers
异常值        62
正常值    159548
Name: isDefault, dtype: int64
**********
正常值    793973
异常值      6027
Name: annualIncome_outliers, dtype: int64
annualIncome_outliers
异常值       756
正常值    158854
Name: isDefault, dtype: int64
**********
正常值    800000
Name: verificationStatus_outliers, dtype: int64
verificationStatus_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    783003
异常值     16997
Name: purpose_outliers, dtype: int64
purpose_outliers
异常值      3635
正常值    155975
Name: isDefault, dtype: int64
**********
正常值    798931
异常值      1069
Name: postCode_outliers, dtype: int64
postCode_outliers
异常值       221
正常值    159389
Name: isDefault, dtype: int64
**********
正常值    799994
异常值         6
Name: regionCode_outliers, dtype: int64
regionCode_outliers
异常值         1
正常值    159609
Name: isDefault, dtype: int64
**********
正常值    798440
异常值      1560
Name: dti_outliers, dtype: int64
dti_outliers
异常值       466
正常值    159144
Name: isDefault, dtype: int64
**********
正常值    778245
异常值     21755
Name: delinquency_2years_outliers, dtype: int64
delinquency_2years_outliers
异常值      5089
正常值    154521
Name: isDefault, dtype: int64
**********
正常值    788261
异常值     11739
Name: ficoRangeLow_outliers, dtype: int64
ficoRangeLow_outliers
异常值       778
正常值    158832
Name: isDefault, dtype: int64
**********
正常值    788261
异常值     11739
Name: ficoRangeHigh_outliers, dtype: int64
ficoRangeHigh_outliers
异常值       778
正常值    158832
Name: isDefault, dtype: int64
**********
正常值    790889
异常值      9111
Name: openAcc_outliers, dtype: int64
openAcc_outliers
异常值      2195
正常值    157415
Name: isDefault, dtype: int64
**********
正常值    792471
异常值      7529
Name: pubRec_outliers, dtype: int64
pubRec_outliers
异常值      1701
正常值    157909
Name: isDefault, dtype: int64
**********
正常值    794120
异常值      5880
Name: pubRecBankruptcies_outliers, dtype: int64
pubRecBankruptcies_outliers
异常值      1423
正常值    158187
Name: isDefault, dtype: int64
**********
正常值    790001
异常值      9999
Name: revolBal_outliers, dtype: int64
revolBal_outliers
异常值      1359
正常值    158251
Name: isDefault, dtype: int64
**********
正常值    799948
异常值        52
Name: revolUtil_outliers, dtype: int64
revolUtil_outliers
异常值        23
正常值    159587
Name: isDefault, dtype: int64
**********
正常值    791663
异常值      8337
Name: totalAcc_outliers, dtype: int64
totalAcc_outliers
异常值      1668
正常值    157942
Name: isDefault, dtype: int64
**********
正常值    800000
Name: initialListStatus_outliers, dtype: int64
initialListStatus_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    784586
异常值     15414
Name: applicationType_outliers, dtype: int64
applicationType_outliers
异常值      3875
正常值    155735
Name: isDefault, dtype: int64
**********
正常值    775134
异常值     24866
Name: title_outliers, dtype: int64
title_outliers
异常值      3900
正常值    155710
Name: isDefault, dtype: int64
**********
正常值    800000
Name: policyCode_outliers, dtype: int64
policyCode_outliers
正常值    159610
Name: isDefault, dtype: int64
**********
正常值    782773
异常值     17227
Name: n0_outliers, dtype: int64
n0_outliers
异常值      3485
正常值    156125
Name: isDefault, dtype: int64
**********
正常值    790500
异常值      9500
Name: n1_outliers, dtype: int64
n1_outliers
异常值      2491
正常值    157119
Name: isDefault, dtype: int64
**********
正常值    789067
异常值     10933
Name: n2_outliers, dtype: int64
n2_outliers
异常值      3205
正常值    156405
Name: isDefault, dtype: int64
**********
正常值    789067
异常值     10933
Name: n2.1_outliers, dtype: int64
n2.1_outliers
异常值      3205
正常值    156405
Name: isDefault, dtype: int64
**********
正常值    788660
异常值     11340
Name: n4_outliers, dtype: int64
n4_outliers
异常值      2476
正常值    157134
Name: isDefault, dtype: int64
**********
正常值    790355
异常值      9645
Name: n5_outliers, dtype: int64
n5_outliers
异常值      1858
正常值    157752
Name: isDefault, dtype: int64
**********
正常值    786006
异常值     13994
Name: n6_outliers, dtype: int64
n6_outliers
异常值      3182
正常值    156428
Name: isDefault, dtype: int64
**********
正常值    788430
异常值     11570
Name: n7_outliers, dtype: int64
n7_outliers
异常值      2746
正常值    156864
Name: isDefault, dtype: int64
**********
正常值    789625
异常值     10375
Name: n8_outliers, dtype: int64
n8_outliers
异常值      2131
正常值    157479
Name: isDefault, dtype: int64
**********
正常值    786384
异常值     13616
Name: n9_outliers, dtype: int64
n9_outliers
异常值      3953
正常值    155657
Name: isDefault, dtype: int64
**********
正常值    788979
异常值     11021
Name: n10_outliers, dtype: int64
n10_outliers
异常值      2639
正常值    156971
Name: isDefault, dtype: int64
**********
正常值    799434
异常值       566
Name: n11_outliers, dtype: int64
n11_outliers
异常值       112
正常值    159498
Name: isDefault, dtype: int64
**********
正常值    797585
异常值      2415
Name: n12_outliers, dtype: int64
n12_outliers
异常值       545
正常值    159065
Name: isDefault, dtype: int64
**********
正常值    788907
异常值     11093
Name: n13_outliers, dtype: int64
n13_outliers
异常值      2482
正常值    157128
Name: isDefault, dtype: int64
**********
正常值    788884
异常值     11116
Name: n14_outliers, dtype: int64
n14_outliers
异常值      3364
正常值    156246
Name: isDefault, dtype: int64
**********

#删除异常值 因为删除了异常值，所以需要reset_index重置索引
for fea in numerical_fea:data_train = data_train[data_train[fea+'_outliers']=='正常值']data_train = data_train.reset_index(drop=True)     #drop = true，表示不保留原始index，默认会把原始index作为数据列保存

数据分桶

#固定宽度分箱
# 通过除法映射到间隔均匀的分箱中，每个分箱的取值范围都是loanAmnt/1000 ，loanAmnt贷款金额
data['loanAmnt_bin1'] = np.floor_divide(data['loanAmnt'], 1000)

## 通过对数函数映射到指数宽度分箱
data['loanAmnt_bin2'] = np.floor(np.log10(data['loanAmnt']))

 #分位数分箱 将一组数据进行均匀分箱，10表示分为10组
data['loanAmnt_bin3'] = pd.qcut(data['loanAmnt'], 10, labels=False)

特征交互

特征之间的交互可衍生出新的特征

for col in ['grade', 'subGrade']: temp_dict = data_train.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={
    'mean': col + '_target_mean'})temp_dict.index = temp_dict[col].valuestemp_dict = temp_dict[col + '_target_mean'].to_dict()data_train[col + '_target_mean'] = data_train[col].map(temp_dict)data_test_a[col + '_target_mean'] = data_test_a[col].map(temp_dict)

# 其他衍生变量 mean 和 std
for df in [data_train, data_test_a]:for item in ['n0','n1','n2','n2.1','n4','n5','n6','n7','n8','n9','n10','n11','n12','n13','n14']:df['grade_to_mean_' + item] = df['grade'] / df.groupby([item])['grade'].transform('mean')df['grade_to_std_' + item] = df['grade'] / df.groupby([item])['grade'].transform('std')

特征编码

labelEncode 直接放入树模型中

#label-encode:subGrade,postCode,title
# 高维类别特征需要进行转换
for col in tqdm(['employmentTitle', 'postCode', 'title','subGrade']):le = LabelEncoder()le.fit(list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values))data_train[col] = le.transform(list(data_train[col].astype(str).values))data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding 完成')

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.81s/it]Label Encoding 完成

逻辑回归等模型要单独增加的特征工程

对特征做归一化，去除相关性高的特征
归一化目的是让训练过程更好更快的收敛，避免特征大吃小的问题
去除相关性是增加模型的可解释性，加快预测过程。

归一化伪代码：

# 举例归一化过程
#伪代码
for fea in [要归一化的特征列表]：data[fea] = ((data[fea] - np.min(data[fea])) / (np.max(data[fea]) - np.min(data[fea])))

特征选择

放弃一些不必要的特征

特征选择的方法：

Filter
方差选择法
相关系数法（pearson 相关系数）
卡方检验
互信息法
Wrapper （RFE）
递归特征消除法
Embedded
基于惩罚项的特征选择法
基于树模型的特征选择