# Imported here so this section runs on its own; harmless if pandas was
# already imported earlier in the file.
import pandas

# --- Missing-value handling on the distribution-analysis data set ---
data_table = pandas.read_csv(r"/Users/JMP/Jmp_data/Python/PDABook/第五章/5.5 分布分析/分布分析.csv")

# Show the rows whose '性别' (gender) value is missing.
new = data_table[data_table['性别'].isnull()]
print(new)

# Strategy 1: build a new table with every row that contains any missing
# value removed.  dropna returns a new object; data_table is not modified.
newtabel = data_table.dropna(axis=0)
print(newtabel.head())

# Strategy 2: fill missing '年龄' (age) values in place with the column mean
# (mean() ignores missing values by default).
data_table['年龄'] = data_table['年龄'].fillna(data_table['年龄'].mean())
print(data_table)
# Imported here so this section runs on its own; harmless if these names
# were already imported earlier in the file.
from datetime import datetime

import pandas


def _score_by_quantile(values, quantiles, labels):
    """Bucket *values* into labelled bins whose edges are its own quantiles.

    Args:
        values: numeric pandas Series to score.
        quantiles: increasing quantile points used as bin edges; one more
            entry than *labels*.
        labels: label assigned to each bin, lowest bin first.

    Returns:
        A ``(edges, scores)`` tuple: the Series of bin edges actually used
        and the categorical Series produced by ``pandas.cut``.

    Raises:
        ValueError: propagated from ``pandas.cut`` when two quantiles
            coincide, because bin edges must be unique.
    """
    edges = values.quantile(q=quantiles, interpolation="nearest")
    # pandas.cut bins are open on the left, so with the minimum as the first
    # edge the smallest observation would fall outside every bin.  Forcing
    # the first edge to 0 keeps it inside (this relies on values being > 0).
    edges.iloc[0] = 0
    return edges, pandas.cut(values, edges, labels=labels)


# --- RFM analysis ---
data_group = pandas.read_csv(r"/Users/JMP/Jmp_data/Python/PDABook/第五章/5.7 RFM分析/RFM分析.csv")
print(data_group.head())

# Parse the deal date and derive the number of whole days elapsed since
# each deal, measured from the moment the script runs.
data_group['DealDateTime'] = pandas.to_datetime(data_group.DealDateTime, format='%Y/%m/%d')
print(data_group['DealDateTime'])
data_group['DateDiff'] = datetime.now() - data_group['DealDateTime']
data_group['DateDiff'] = data_group['DateDiff'].dt.days
print(data_group.head())

# Per-customer aggregates:
#   R - days since the most recent deal (minimum DateDiff)
#   F - number of orders
#   M - total sales amount
R_agg = data_group.groupby(by="CustomerID", as_index=False)["DateDiff"].agg("min")
F_Agg = data_group.groupby(by="CustomerID", as_index=False)["OrderID"].agg("count")
M_Agg = data_group.groupby(by="CustomerID", as_index=False)["Sales"].agg("sum")
# merge() joins on the column the frames share, i.e. CustomerID.
aggData = R_agg.merge(F_Agg).merge(M_Agg)
print(aggData)

# Score each dimension from 1 to 5 by quintile.
_QUINTILES = [0, 0.2, 0.4, 0.6, 0.8, 1]
fLable = [1, 2, 3, 4, 5]

# NOTE(review): recency uses the same ascending labels as F and M, so a
# larger DateDiff (an older last purchase) receives a HIGHER R score.  RFM
# scoring conventionally reverses this ([5, 4, 3, 2, 1]) - confirm intent.
bins, R_S = _score_by_quantile(aggData.DateDiff, _QUINTILES, fLable)
print(bins)
print(R_S)

bins, F_S = _score_by_quantile(aggData.OrderID, _QUINTILES, fLable)
print(bins)
print(F_S)

bins, M_S = _score_by_quantile(aggData.Sales, _QUINTILES, fLable)
print(bins)
print(M_S)

aggData['R_S'] = R_S
aggData['F_S'] = F_S
aggData['M_S'] = M_S

# Combine the three scores into one three-digit number: R in the hundreds
# place, F in the tens place, M in the units place.
aggData["RFM"] = 100 * R_S.astype(int) + 10 * F_S.astype(int) + M_S.astype(int)
print(aggData)

# Split the combined score into eight customer levels by octile.
rfmlables = [1, 2, 3, 4, 5, 6, 7, 8]
bins, aggData['Level'] = _score_by_quantile(
    aggData.RFM,
    [0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1],
    rfmlables,
)
print(aggData)