tutoring2

前期准备

import qgrid
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from functools import reduce
import matplotlib.pyplot as plt
from pyecharts.charts import Bar, Line, Page
from pyecharts import options as opts

%matplotlib inline
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)

封装函数

def liftchart_num(data, score_bins, loan_cnt, bad_rate, bad_rate_cum, cover, auc, ks):
    bar = (
        Bar(init_opts=opts.InitOpts(width="900px", height="400px"))
        .add_xaxis(list(data[score_bins]))
        .add_yaxis("样本量", list(data[loan_cnt]), color="#1f77b4")
        .extend_axis(yaxis=opts.AxisOpts(axislabel_opts=opts.LabelOpts(formatter="{value}%")))
#         .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            toolbox_opts=opts.ToolboxOpts(),
            legend_opts=opts.LegendOpts(pos_top="7%"),
            datazoom_opts=opts.DataZoomOpts(range_start=0,range_end=100),
            title_opts=opts.TitleOpts(title="%s" % score_bins, 
                                      subtitle='COVER={0:.3f}   AUC={1:.3f}   KS={2:.3f}'.format(cover,auc,ks))
        )
    )
    
    line = (
        Line()
        .add_xaxis(data[score_bins])
        .add_yaxis("每段首逾率", list(data[bad_rate]), yaxis_index=1, 
                   linestyle_opts=opts.LineStyleOpts(color="#DE47C8", width=3),
                   itemstyle_opts=opts.ItemStyleOpts(color="#DE47C8"))
        .add_yaxis("累积首逾率", list(data[bad_rate_cum]), yaxis_index=1,
                   linestyle_opts=opts.LineStyleOpts(color="#ff7f0e", width=3),
                   itemstyle_opts=opts.ItemStyleOpts(color="#ff7f0e"))
#         .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    ); bar.overlap(line); return bar


def liftchart_cat(data, score_bins, loan_cnt, bad_rate, bad_rate_all, cover):
    bar = (
        Bar(init_opts=opts.InitOpts(width="1000px", height="400px"))
        .add_xaxis([str(x) for x in data[score_bins]])
        .add_yaxis("样本量", list(data[loan_cnt]), color="#1f77b4")
        .extend_axis(yaxis=opts.AxisOpts(axislabel_opts=opts.LabelOpts(formatter="{value}%")))
#         .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts(title="%s" % score_bins, subtitle='COVER={0:.3f}'.format(cover)),
            legend_opts=opts.LegendOpts(pos_top="7%"),
            toolbox_opts=opts.ToolboxOpts(),
            datazoom_opts=opts.DataZoomOpts(range_start=0,range_end=100)
        )
    )
    
    line = (
        Line()
        .add_xaxis([str(x) for x in data[score_bins]])
        .add_yaxis("每段首逾率", list(data[bad_rate]), yaxis_index=1, 
                   linestyle_opts=opts.LineStyleOpts(color="#ff7f0e", width=3),
                   itemstyle_opts=opts.ItemStyleOpts(color="#ff7f0e"))
         .set_series_opts(
#           label_opts=opts.LabelOpts(is_show=False)
            markline_opts=opts.MarkLineOpts(data=[opts.MarkLineItem(y=bad_rate_all, name="整体首逾率")])
        )
    ); bar.overlap(line); return bar


def liftchart(data, xvars, yvar, uv, g=10):
    
    data_tmp = data.copy()
    page = Page()
    for xvar in xvars:
        
        ######## 可视化离散型变量
        if data_tmp[xvar].nunique() <= uv or data_tmp[xvar].dtype == 'object':
            
            ## 处理缺失值与-9999999
#             data_tmp[xvar] = data_tmp[xvar].astype('str')
            cover = 1 - sum(pd.isnull(data_tmp[xvar])) / len(data_tmp)
            bad_rate_all = round(data_tmp[yvar].mean()*100, 2)
#             data_tmp[xvar].replace('-9999999', '-999', inplace=True)
    
            result_cat = data_tmp \
                .groupby(xvar, as_index=False)[yvar] \
                .agg({'good' : lambda x: sum(x==0), 
                      'bad'  : lambda x: sum(x==1), 
                      'total': lambda x: len(x)}) \
                .fillna(0)
    
            result_cat['bad_rate'] = (result_cat['bad'] / result_cat['total']).map(lambda x: '{:.2f}'.format(x*100))
        
            plot_cat = liftchart_cat(result_cat, xvar, 'total', 'bad_rate', bad_rate_all, cover); page.add(plot_cat)
        
        ######## 可视化连续性变量
        else:
            
            data_tmp[xvar+'_bin'] = np.where(data_tmp[xvar] < 0, '999999999', pd.qcut(data_tmp[xvar], q=g, duplicates='drop', precision=1).astype('str'))
            
            result_num = data_tmp \
                .groupby(xvar+'_bin', as_index=False)[yvar] \
                .agg({'good' : lambda x: sum(x==0), 
                      'bad'  : lambda x: sum(x==1), 
                      'total': lambda x: len(x)}) \
                .fillna(0)
            
            result_num['bins_fst'] = result_num[xvar+'_bin'].apply(lambda x: float(x.split(',')[0].replace('(', '')))
            result_num = result_num.sort_values('bins_fst').reset_index(drop=True)
            
            data_tmp_g0 = data_tmp[data_tmp[xvar] > 0]
            score_median = data_tmp_g0[xvar].median()
            fst_rate_g = data_tmp_g0.loc[data_tmp_g0[xvar] > score_median, yvar].mean()
            fst_rate_l = data_tmp_g0.loc[data_tmp_g0[xvar] < score_median, yvar].mean()
            
            if fst_rate_g < fst_rate_l:
                
                result_num = result_num \
                    .assign(**{'is_neg_999': result_num[xvar+'_bin'] == '999999999', 
                               'is_nan'    : result_num[xvar+'_bin'] == 'nan'}) \
                    .sort_values(['is_neg_999', 'is_nan', 'bins_fst'], ascending=[1, 1, 0])

            result_num = result_num \
                .assign(bad_cum  = result_num['bad'].cumsum(),
                        good_cum = result_num['good'].cumsum(),
                        bad_rate = result_num['bad'] / result_num['total'])
            
            result_num['bad_rate_cum'] = result_num['bad_cum'] / result_num['total'].cumsum()
            result_num[['bad_rate','bad_rate_cum']] = result_num[['bad_rate','bad_rate_cum']].applymap(lambda x: '{:.2f}'.format(x*100))
            result_num.replace(to_replace=['999999999'], value=['-999'], inplace=True)
#             result_num[xvar+'_bin'] = result_num[xvar+'_bin'].str.replace('\\.0', '')
            
            #### 计算AUC、KS和IV
            result_num_woe = result_num[~result_num[xvar+'_bin'].isin(['-999','nan'])]
            result_num_woe.reset_index(drop=True, inplace=True)
            
            result_num_woe = result_num_woe \
                .assign(bad_rate_woe  = result_num_woe['bad'] / result_num_woe['bad_cum'][len(result_num_woe)-1],
                        good_rate_woe = result_num_woe['good'] / result_num_woe['good_cum'][len(result_num_woe)-1])
            
            result_num_woe['woe_v'] = np.log(result_num_woe['bad_rate_woe'] / result_num_woe['good_rate_woe'])
            result_num_woe['woe_w'] = result_num_woe['bad_rate_woe'] - result_num_woe['good_rate_woe']
            
            pred, real = data_tmp.loc[data_tmp[xvar] >= 0, xvar], data_tmp.loc[data_tmp[xvar] >= 0, yvar]
            fpr, tpr, thresholds = metrics.roc_curve(real, pred, pos_label=1)
            
            cover = 1 - sum(pd.isnull(data_tmp[xvar])) / len(data_tmp)
            auc = metrics.auc(fpr, tpr)
            ks  = max(abs(tpr - fpr))
            iv  = sum(result_num_woe['woe_w'] * result_num_woe['woe_v'])
            
            plot_num = liftchart_num(result_num, xvar+'_bin', 'total', 'bad_rate', 'bad_rate_cum', cover, auc, ks); page.add(plot_num)
#     return result_num_woe
    return page

读取数据

df_loc_raw = pd.read_excel("../data/return/join.xlsx")

df_return_raw = pd.read_excel("../data/return/df_return.xlsx")

数据处理

df_return_raw.rename(columns={'asdlfkjalskjf':'score'}, inplace=True)
df_return_select = df_return_raw[['score']]
df_return_select['y'] = [random.randint(0,1) for _ in range(len(df_return_select))]

绘图

p_all = liftchart(df_return_select, xvars=['score'], yvar='y', uv=20, g=10)
p_all.render_notebook()
    <div id="8e83fda3b7a94eacb5d6531aa59f6d99" style="width:900px; height:400px;"></div>