Initialization¶
Import dependencies¶
In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
In [2]:
# import warnings
# warnings.filterwarnings('ignore')
Configure output¶
In [3]:
# from basetool.init_tool import *
# from basetool.data_analysis import *
# from basetool.feature_engineering import *
# from basetool.feature_selection import *
# from basetool.model_evolution import *
# from basetool.model_hyperparameter import *
# init_setting()
Timing utilities¶
In [4]:
import time
import functools
'''
Created on 2018-11-01
@author: jiangpeizhao
Prints a function's running time.
'''
def clock(func):
    """
    - Print the decorated function's running time
    :param func:
    :return:
    """
    @functools.wraps(func)
    def clocked(*args, **kwargs):
        t0 = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - t0
        name = func.__name__
        arg_lst = []
        if args:
            arg_lst.append(', '.join(repr(arg) for arg in args))
        if kwargs:
            pairs = ['%s=%r' % (k, w) for k, w in sorted(kwargs.items())]
            arg_lst.append(', '.join(pairs))
        arg_str = ', '.join(arg_lst)
        print('[%0.8fs] %s(%s) ' % (elapsed, name, arg_str))
        return result
    return clocked
In [5]:
import time
import functools
def clock_with_result(func):
    """
    - Print the decorated function's running time and its return value
    :param func:
    :return:
    """
    @functools.wraps(func)
    def clocked(*args, **kwargs):
        t0 = time.time()
        result = func(*args, **kwargs)
        elapsed = time.time() - t0
        name = func.__name__
        arg_lst = []
        if args:
            arg_lst.append(', '.join(repr(arg) for arg in args))
        if kwargs:
            pairs = ['%s=%r' % (k, w) for k, w in sorted(kwargs.items())]
            arg_lst.append(', '.join(pairs))
        arg_str = ', '.join(arg_lst)
        print('[%0.8fs] %s(%s) -> %r ' % (elapsed, name, arg_str, result))
        return result
    return clocked
In [6]:
@clock
def add(x, y):
    return x + y
add(1, 1)
[0.00000024s] add(1, 1)
Out[6]:
2
Basic information¶
Field descriptions¶
NO | Field | Type | Description
---|---|---|---
1 | ID | Int | Unique customer identifier
2 | age | Int | Customer age
3 | job | String | Customer occupation
4 | marital | String | Marital status
5 | education | String | Education level
6 | default | String | Whether the customer has a record of default
7 | balance | Int | Average yearly account balance
8 | housing | String | Whether the customer has a housing loan
9 | loan | String | Whether the customer has a personal loan
10 | contact | String | Communication type used to contact the customer
11 | day | Int | Day of the month of the last contact
12 | month | String | Month of the last contact
13 | duration | Int | Duration of the last contact
14 | campaign | Int | Number of contacts with this customer during this campaign
15 | pdays | Int | Days since the customer was last contacted in a previous campaign (999 means never previously contacted)
16 | previous | Int | Number of contacts with this customer before this campaign
17 | poutcome | String | Outcome of the previous campaign
18 | y | Int | Target to predict: whether the customer subscribes to a term deposit
Evaluation metric¶
The evaluation metric for this task is AUC. In the submission, ID is the unique customer identifier and pred is the predicted probability that the customer subscribes to a term deposit.
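As a minimal sketch of the metric (using sklearn's roc_auc_score on hypothetical labels and probabilities, for illustration only):
from sklearn.metrics import roc_auc_score
import numpy as np
# Toy labels and predicted probabilities; the AUC for this example is 0.75
y_true = np.array([0, 0, 1, 1])
pred = np.array([0.1, 0.4, 0.35, 0.8])
print(roc_auc_score(y_true, pred))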
Load data¶
Read the files¶
In [7]:
df_train = pd.read_csv('../doc/data/bank/input/train_set.csv')
df_test = pd.read_csv('../doc/data/bank/input/test_set.csv')
Concatenate the data¶
In [8]:
# DataFrame.append is deprecated in newer pandas; pd.concat (shown in the next cell) is preferred
df_all = df_train.append(df_test)
df_all.shape
Out[8]:
(36169, 18)
In [9]:
df_list = []
df_list.append(df_train)
df_list.append(df_test)
df_all = pd.concat(df_list)
df_all.shape
Out[9]:
(36169, 18)
Explore the data¶
Data types¶
In [10]:
df_train.dtypes
Out[10]:
ID int64
age int64
job object
marital object
education object
default object
balance int64
housing object
loan object
contact object
day int64
month object
duration int64
campaign int64
pdays int64
previous int64
poutcome object
y int64
dtype: object
Columns containing null values¶
In [11]:
import pandas as pd
import numpy as np
# Create a DataFrame containing some null values
df = pd.DataFrame({
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, 7, 8],
    'C': [9, 10, 11, 12],
    'D': [3, np.nan, np.nan, np.nan]
})
# Find the columns that contain null values
null_columns = df.columns[df.isnull().any()]
print(null_columns)
Index(['A', 'B', 'D'], dtype='object')
Rows containing null values¶
In [12]:
import pandas as pd
# Suppose you have the following DataFrame
df = pd.DataFrame({
    'A': [1, 2, None, 4],
    'B': [5, None, 7, 8],
    'C': [9, 10, 11, None]
})
# Find the rows that contain null values
null_rows = df[df.isnull().any(axis=1)]
null_rows
Out[12]:
 | A | B | C
---|---|---|---
1 | 2.0 | NaN | 10.0 |
2 | NaN | 7.0 | 11.0 |
3 | 4.0 | 8.0 | NaN |
Positions of null values¶
In [13]:
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'A': [1, 2, None, 4],
    'B': [5, None, 7, 8],
    'C': [9, 10, 11, None]
})
You can locate every null value with the following code:
In [14]:
null_positions = np.where(df.isnull())
null_positions
Out[14]:
(array([1, 2, 3]), array([1, 0, 2]))
In [15]:
row_indices, column_indices = np.where(df.isnull())
print(row_indices)
print(column_indices)
[1 2 3]
[1 0 2]
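A small convenience built on the two arrays above pairs each row index with its column name:
# Pair row positions with column names for readability
positions = [(r, df.columns[c]) for r, c in zip(row_indices, column_indices)]
print(positions)  # [(1, 'B'), (2, 'A'), (3, 'C')]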
Fill nulls in column B with values from column A¶
In [16]:
import pandas as pd
import numpy as np
# Create an example DataFrame containing null values
df = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [5, np.nan, 7, np.nan]
})
# Fill the null values in column B with the values from column A
df['B'] = df['B'].fillna(df['A'])
df
Out[16]:
 | A | B
---|---|---
0 | 1 | 5.0 |
1 | 2 | 2.0 |
2 | 3 | 7.0 |
3 | 4 | 4.0 |
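An equivalent assignment-based alternative (a sketch; combine_first fills nulls in B from A at matching indices):
df2 = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, np.nan, 7, np.nan]})
df2['B'] = df2['B'].combine_first(df2['A'])
print(df2['B'].tolist())  # [5.0, 2.0, 7.0, 4.0]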
Missing-data overview¶
In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
def missing_ratio(df, is_plot=True, ascending=False, top_n=30):
    """
    Show each feature's missing-value ratio, ordered by severity
    :param df: dataset to analyze
    :param is_plot: whether to draw a bar plot
    :param ascending: if False, features with the most missing values come first; if True, the least
    :param top_n: maximum number of features to show; None shows all
    :return: Dataframe
    """
    df_na = (df.isnull().sum() / len(df)) * 100
    if top_n is None:
        df_na = df_na.drop(df_na[df_na == 0].index).sort_values(ascending=ascending)
    else:
        df_na = df_na.drop(df_na[df_na == 0].index).sort_values(ascending=ascending)[:top_n]
    missing_data = pd.DataFrame({'Missing Ratio': df_na})
    if is_plot and not missing_data.empty:
        # Bar plot of missing-value ratios
        f, ax = plt.subplots(figsize=(15, 12))
        plt.xticks(rotation='90')
        sns.barplot(x=df_na.index, y=df_na)
        plt.xlabel('Features', fontsize=15)
        plt.ylabel('Percent of missing values', fontsize=15)
        plt.title('Percent missing data by feature', fontsize=15)
    return missing_data
In [18]:
missing_ratio(df_train)
Out[18]:
Missing Ratio |
---|
In [19]:
missing_ratio(df_test)
Out[19]:
Missing Ratio |
---|
Missing-value list and histogram¶
In [20]:
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
def get_missing_list(app_train, is_plot=True):
    # List of missing-value ratios
    app_train_na = (app_train.isnull().sum() / len(app_train)) * 100
    app_train_na = app_train_na.drop(app_train_na[app_train_na == 0].index).sort_values(ascending=False)[:30]
    missing_data = pd.DataFrame({'Missing Ratio': app_train_na})
    if is_plot:
        # Bar plot of missing-value ratios
        f, ax = plt.subplots(figsize=(15, 12))
        plt.xticks(rotation='90')
        sns.barplot(x=app_train_na.index, y=app_train_na)
        plt.xlabel('Features', fontsize=15)
        plt.ylabel('Percent of missing values', fontsize=15)
        plt.title('Percent missing data by feature', fontsize=15)
    return missing_data
In [21]:
tmp_df_train = df_train.copy()
mask = np.random.choice([True, False], size=tmp_df_train['balance'].shape, p=[0.3, 0.7])
tmp_df_train.loc[mask, 'balance'] = np.nan
mask = np.random.choice([True, False], size=tmp_df_train['pdays'].shape, p=[0.1, 0.9])
tmp_df_train.loc[mask, 'pdays'] = np.nan
missing_data = get_missing_list(tmp_df_train)
missing_data
Out[21]:
 | Missing Ratio
---|---
balance | 29.332069 |
pdays | 9.768140 |
Inspect column values¶
sample¶
In [22]:
df_train.sample(5)
Out[22]:
 | ID | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
23488 | 23489 | 35 | management | married | tertiary | no | 4124 | yes | no | cellular | 23 | jul | 350 | 2 | 445 | 7 | other | 1 |
22719 | 22720 | 28 | technician | single | tertiary | no | 11862 | no | no | cellular | 21 | oct | 549 | 2 | 238 | 13 | other | 1 |
15544 | 15545 | 32 | technician | married | secondary | no | 9608 | no | no | cellular | 8 | aug | 114 | 4 | -1 | 0 | unknown | 0 |
10282 | 10283 | 71 | retired | married | unknown | no | 362 | no | no | telephone | 12 | feb | 139 | 2 | -1 | 0 | unknown | 0 |
21693 | 21694 | 36 | blue-collar | single | secondary | no | 376 | yes | yes | cellular | 4 | feb | 180 | 1 | 274 | 3 | failure | 0 |
In [23]:
def get_sample_df(df, sample_n=0.2):
    """
    Draw a sample from the dataset
    :param df: dataset to sample from
    :param sample_n: if an int, the number of rows to sample; if a float in (0.0, 1.0], the sampling fraction
    :return: Dataframe with the sampled rows
    """
    if isinstance(sample_n, int):
        sample_df = df.sample(n=sample_n)
    elif isinstance(sample_n, float):
        if sample_n <= 0.0 or sample_n > 1.0:
            raise ValueError("sample_n as a float must be in (0.0, 1.0].")
        sample_df = df.sample(frac=sample_n)
    else:
        raise ValueError("sample_n must be an 'int' or a 'float'.")
    return sample_df
In [24]:
get_sample_df(df_train, 5)
Out[24]:
 | ID | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
24125 | 24126 | 54 | management | divorced | tertiary | no | 5475 | no | no | cellular | 22 | jul | 321 | 4 | -1 | 0 | unknown | 1 |
16880 | 16881 | 52 | technician | married | secondary | no | 83 | no | no | telephone | 22 | jul | 135 | 3 | -1 | 0 | unknown | 0 |
6671 | 6672 | 35 | technician | married | secondary | no | 1508 | no | no | cellular | 28 | jul | 339 | 8 | -1 | 0 | unknown | 0 |
20550 | 20551 | 32 | self-employed | married | tertiary | no | 312 | yes | no | unknown | 9 | may | 262 | 2 | -1 | 0 | unknown | 0 |
23136 | 23137 | 58 | management | divorced | tertiary | no | 5701 | no | no | cellular | 2 | feb | 521 | 1 | -1 | 0 | unknown | 1 |
In [25]:
get_sample_df(df_train, 0.0003)
Out[25]:
 | ID | age | job | marital | education | default | balance | housing | loan | contact | day | month | duration | campaign | pdays | previous | poutcome | y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
7141 | 7142 | 45 | admin. | divorced | tertiary | no | 569 | yes | no | cellular | 14 | may | 228 | 1 | -1 | 0 | unknown | 0 |
11771 | 11772 | 32 | admin. | married | secondary | no | 1078 | yes | no | cellular | 16 | apr | 177 | 1 | -1 | 0 | unknown | 0 |
8641 | 8642 | 55 | technician | divorced | secondary | no | 929 | no | yes | telephone | 28 | sep | 80 | 3 | -1 | 0 | unknown | 0 |
15457 | 15458 | 58 | blue-collar | married | primary | no | 47 | no | no | cellular | 6 | aug | 281 | 2 | -1 | 0 | unknown | 0 |
14413 | 14414 | 27 | admin. | divorced | secondary | no | 0 | no | yes | cellular | 14 | jul | 227 | 4 | -1 | 0 | unknown | 0 |
13372 | 13373 | 60 | management | married | secondary | no | 725 | yes | no | cellular | 8 | jul | 266 | 1 | -1 | 0 | unknown | 0 |
21048 | 21049 | 46 | technician | married | secondary | yes | 4 | no | yes | cellular | 9 | jul | 78 | 1 | -1 | 0 | unknown | 0 |
22201 | 22202 | 31 | admin. | married | secondary | no | 355 | yes | no | cellular | 18 | nov | 530 | 1 | 193 | 1 | failure | 0 |
Rotate seaborn x tick labels¶
g.set_xticklabels(rotation=30)
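A runnable version of this fragment, assuming g is a seaborn FacetGrid (e.g. returned by catplot; 'job' is used here only because its category names are long):
import seaborn as sns
import matplotlib.pyplot as plt
g = sns.catplot(x='job', kind='count', data=df_train)
g.set_xticklabels(rotation=30)  # rotate the x tick labels
plt.show()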
Histogram¶
In [26]:
sns.displot(df_train['duration'])
Out[26]:
<seaborn.axisgrid.FacetGrid at 0x7f2284bd6690>
In [27]:
sns.displot(df_train['duration'], bins=20)
Out[27]:
<seaborn.axisgrid.FacetGrid at 0x7f2282415690>
Skewed features¶
In [28]:
from scipy import stats
from scipy.stats import norm, skew
numeric_feats = df_train.dtypes[(df_train.dtypes != "category") & (df_train.dtypes != "object")].index
# Check the skew of all numerical features
skewed_feats = df_train[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(20)
Skew in numerical features:
Out[28]:
 | Skew
---|---
previous | 51.178561 |
balance | 8.515881 |
campaign | 4.849919 |
duration | 3.150829 |
pdays | 2.623793 |
y | 2.383820 |
age | 0.697072 |
day | 0.087181 |
ID | 0.000000 |
Log transform for skewed features¶
ridge_dataset_df['room_temp_max'] = np.log(ridge_dataset_df['room_temp_max'])
ridge_dataset_df['room_temp_max'] = np.exp(ridge_dataset_df['room_temp_max'])
ridge_dataset_df[CFG.target_col] = np.log1p(ridge_dataset_df[CFG.target_col])
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
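The fragment above comes from another project (ridge_dataset_df, CFG and stacked_averaged_models are not defined here); applied to this dataset, a minimal sketch might be:
# log1p is safe for zero values; expm1 inverts the transform
duration_log = np.log1p(df_train['duration'])
duration_back = np.expm1(duration_log)
print(skew(df_train['duration']), '->', skew(duration_log))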
Box-Cox transform¶
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    app_train[feat] = boxcox1p(app_train[feat], lam)
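A self-contained sketch of the same idea on this dataset (boxcox1p requires non-negative input; lam=0.15 follows the snippet above):
from scipy.special import boxcox1p
lam = 0.15
duration_bc = boxcox1p(df_train['duration'], lam)
print(skew(df_train['duration']), '->', skew(duration_bc))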
Scatter plot¶
In [29]:
plt.scatter(df_train.balance, df_train.duration, c = "blue", marker = "s")
plt.title("Looking for outliers")
plt.xlabel("Balance")
plt.ylabel("Duration")
plt.show()
Kernel density estimate (KDE) plot¶
In [30]:
sns.kdeplot(x=df_train.balance, y=df_train.duration, shade=True, n_levels=20, cmap="Reds")
Out[30]:
<AxesSubplot:xlabel='balance', ylabel='duration'>
Scatter plots and distributions¶
In [31]:
sns.pairplot(df_train.sample(100), height=2.5)
Out[31]:
<seaborn.axisgrid.PairGrid at 0x7f22844e8cd0>
Density and distribution pair grid¶
In [32]:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_distribution(df, sample_n=100):
    """
    Pairwise density and distribution plots; slow on large datasets
    :param df: dataset to analyze
    :param sample_n: if an int, the number of rows to sample; if a float in (0.0, 1.0], the sampling fraction
    :return:
    """
    sample_df = get_sample_df(df, sample_n=sample_n)
    grid = sns.PairGrid(sample_df)
    grid = grid.map_upper(plt.scatter, color='darkred')
    grid = grid.map_diag(plt.hist, bins=10, color='darkred', edgecolor='k')
    grid = grid.map_lower(sns.kdeplot, cmap='Reds')
    plt.show()
In [33]:
plot_distribution(df_train, sample_n=100)
Pair plot grouped by a discrete column¶
In [34]:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_distribution_by_discrete(df, hue, sample_n=100, diag_kind='auto'):
    """
    Pair plot colored by a discrete column, with transparency; slow on large datasets
    :param diag_kind: 'auto' or 'kde'
    :param df: dataset to analyze
    :param hue: name of the discrete column
    :param sample_n: if an int, the number of rows to sample; if a float in (0.0, 1.0], the sampling fraction
    :return:
    """
    sample_df = get_sample_df(df, sample_n=sample_n)
    sns.pairplot(sample_df, hue=hue, diag_kind=diag_kind,
                 plot_kws={'alpha': 0.6, 's': 80, 'edgecolor': 'k'},
                 height=4)
    plt.show()
In [35]:
plot_distribution_by_discrete(df_train, 'y', sample_n=100)
Correlation¶
Full correlation matrix¶
In [36]:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_corr(df):
    """
    Plot the full correlation matrix
    :param df: dataset to analyze
    :return:
    """
    corr_mat = df.corr()
    f, ax = plt.subplots(figsize=(12, 9))
    sns.heatmap(corr_mat, vmax=.8, square=True)
    plt.show()
In [37]:
plot_corr(df_train)
Top-k correlations¶
In [38]:
import seaborn as sns
import matplotlib.pyplot as plt
def plot_corr_top_k(df, target, k=10, small_is_better=False):
    """
    Plot the k features most correlated with the target
    :param df: dataset to analyze
    :param target: target column
    :param k: number of features
    :param small_is_better: if True, plot the k least correlated features instead
    :return:
    """
    corr_mat = df.corr()
    if small_is_better:
        cols = corr_mat.nsmallest(k, target)[target].index
    else:
        cols = corr_mat.nlargest(k, target)[target].index
    cm = np.corrcoef(df[cols].values.T)
    sns.set(font_scale=1.25)
    sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                annot_kws={'size': 9},
                yticklabels=cols.values,
                xticklabels=cols.values)
    plt.show()
In [39]:
plot_corr_top_k(df_train, 'y')
In [40]:
plot_corr_top_k(df_train, 'y', small_is_better=True)
Dual y-axis plot¶
In [41]:
def plot_double_y(df, x_name, y1_name, y2_name, title='Double Y axis', y_from_0=False):
    """
    Plot two series against a shared x-axis with two y-axes
    :param df: Dataframe
    :param x_name: String, x column name
    :param y1_name: String, y1 column name
    :param y2_name: String, y2 column name
    :param title: plot title
    :param y_from_0: whether both y-axes start at 0
    :return:
    """
    X = df[x_name]
    y1 = df[y1_name]
    y2 = df[y2_name]
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    ax1.plot(X, y1, label=y1_name)
    ax2.plot(X, y2, 'r', label=y2_name)
    if y_from_0:
        ax1.set_ylim([0, max(y1)])
        ax2.set_ylim([0, max(y2)])
    ax1.set_xlabel(X.name)
    ax1.set_ylabel(y1.name)
    ax2.set_ylabel(y2.name)
    fig.legend(loc=1, bbox_to_anchor=(1, 1), bbox_transform=ax1.transAxes)
    plt.title(title)
    plt.grid(False)
    plt.show()
In [42]:
plot_double_y(df_train.head(200), 'ID', 'duration', 'pdays', title='Double Y axis', y_from_0=False)
Dual y-axis subplots grouped by a discrete column¶
In [43]:
import math
def plot_subplot_double_y(df, hue, x_name, y1_name, y2_name, legend_first=True):
    """
    Draw dual y-axis plots as multiple subplots, one per group
    :param legend_first: if True, show the legend only on the first subplot; otherwise on all subplots
    :param df: Dataframe
    :param hue: String, name of the grouping column
    :param x_name: String, x column name
    :param y1_name: String, y1 column name
    :param y2_name: String, y2 column name
    :return:
    """
    # print(y1_name, 'is blue;', y2_name, 'is red.')
    fig = plt.figure(figsize=(15, 15))
    fig.subplots_adjust(hspace=0.3, wspace=0.1)
    project_list = list(set(df[hue]))
    project_n = len(project_list)
    for i, v in enumerate(project_list):
        sub_df = df[df[hue] == v]
        X = sub_df[x_name]
        y1 = sub_df[y1_name]
        y2 = sub_df[y2_name]
        ax1 = fig.add_subplot(math.ceil(project_n / 3), 3, i + 1)
        lns1 = ax1.plot(X, y1, label=y1_name)
        ax1.set_title(v)
        plt.xticks([])
        plt.yticks([])
        ax2 = ax1.twinx()  # the second axis shares the same x-axis
        lns2 = ax2.plot(X, y2, 'r', label=y2_name)
        if legend_first and i == 0:
            fig.legend(loc=1, bbox_to_anchor=(1, 1), bbox_transform=ax1.transAxes)
        if not legend_first:
            lns = lns1 + lns2
            labs = [l.get_label() for l in lns]
            ax1.legend(lns, labs, loc=0)
        ax2.set_title(v)
        plt.xticks([])
        plt.yticks([])
    plt.show()
In [44]:
plot_subplot_double_y(df_train.head(200), 'education', 'ID', 'duration', 'pdays', legend_first=True)
Continuous value distribution: age (Int, customer age)¶
In [45]:
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype
from sklearn.preprocessing import LabelEncoder
def plot_dist_by_target(df, source, target):
    """
    Plot a feature's distribution split by a binary (0/1) target
    :param df: dataset to analyze
    :param source: feature to plot
    :param target: target column
    :return:
    """
    if is_string_dtype(df[source]):
        lbl = LabelEncoder()
        lbl.fit(list(df[source].values))
        print(lbl.classes_)
        df[source + '_lbl'] = lbl.transform(list(df[source].values))
        source = source + '_lbl'
    # Distribution for the negative class (target == 0); histplot replaces the deprecated distplot
    sns.histplot(df.loc[df[target] == 0, source], color="r", stat="density", kde=True, label='bad')
    # Distribution for the positive class (target == 1)
    sns.histplot(df.loc[df[target] == 1, source], color="b", stat="density", kde=True, label='good')
    # Label the plot
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source)
    plt.ylabel('Density')
    plt.show()
In [46]:
plot_dist_by_target(df_train, 'age', 'y')
In [47]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
def plot_dist_and_qq(df, target):
    """
    Plot a histogram with a fitted normal curve, plus a QQ-plot
    :param df: dataset to analyze
    :param target: column to plot
    :return:
    """
    sns.histplot(df[target], stat='density', kde=False)
    # Overlay a fitted normal curve (replacement for the deprecated distplot's fit=stats.norm)
    mu, std = stats.norm.fit(df[target].dropna())
    xs = np.linspace(df[target].min(), df[target].max(), 100)
    plt.plot(xs, stats.norm.pdf(xs, mu, std), 'k')
    plt.figure()
    stats.probplot(df[target], plot=plt)
    plt.show()
In [48]:
plot_dist_and_qq(df_train, 'age')
In [49]:
df_all_log = df_all.copy()
df_all_log['age'] = np.log(df_all_log['age'])
plot_dist_and_qq(df_all_log, 'age')
Discrete value distribution: job (String, customer occupation)¶
In [50]:
df_train['job'].describe()
Out[50]:
count 25317
unique 12
top blue-collar
freq 5456
Name: job, dtype: object
In [51]:
set(df_train['job'])
Out[51]:
{'admin.', 'blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired', 'self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown'}
In [52]:
df_train['job'].value_counts()
Out[52]:
blue-collar 5456
management 5296
technician 4241
admin. 2909
services 2342
retired 1273
self-employed 884
entrepreneur 856
unemployed 701
housemaid 663
student 533
unknown 163
Name: job, dtype: int64
In [53]:
plot_dist_by_target(df_train, 'job', 'y')
['admin.' 'blue-collar' 'entrepreneur' 'housemaid' 'management' 'retired' 'self-employed' 'services' 'student' 'technician' 'unemployed' 'unknown']
Feature engineering¶
All column dtypes¶
In [54]:
df_all.dtypes
Out[54]:
ID int64
age int64
job object
marital object
education object
default object
balance int64
housing object
loan object
contact object
day int64
month object
duration int64
campaign int64
pdays int64
previous int64
poutcome object
y float64
dtype: object
Continuous and categorical features¶
In [55]:
cate_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
num_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
Count occurrences grouped by day and month¶
In [56]:
def agg_feature_size(df, features):
    """
    Generate a group-size (count) feature for the given features
    :param df: dataset for feature engineering
    :param features: list of features to group by
    :return:
    """
    feature_name = 'agg_count'
    for i in features:
        feature_name += '_' + i
    temp = df.groupby(features).size().reset_index().rename(columns={0: feature_name})
    df = df.merge(temp, 'left', on=features)
    return df, feature_name
In [57]:
data = df_all.copy()
print(data.shape)
ll = []
for f in ['campaign', 'pdays', 'previous'] + [x for x in cate_features if x != 'month']:
    data, feature_name = agg_feature_size(data, ['day', 'month', f])
    ll.append(feature_name)
print(data.shape)
(36169, 18)
(36169, 29)
Aggregations over sparse_feature columns (few distinct values) and dense_feature columns (many distinct values)¶
In [58]:
sparse_feature = cate_features + ['campaign']
dense_feature = num_features
In [59]:
from tqdm import tqdm
def get_new_columns(col_name, aggregation_dict):
    """
    Flatten an aggregation dict into a list of new column names
    :param col_name: String, the groupby column name
    :param aggregation_dict: dict, aggregations for the columns other than col_name
    :return:
    """
    tmp_list = []
    for k in aggregation_dict.keys():
        for agg in aggregation_dict[k]:
            if callable(agg):
                tmp_list.append(col_name + '_' + k + '_' + 'other')
            else:
                tmp_list.append(col_name + '_' + k + '_' + agg)
    return tmp_list
def agg_feature_list(df, sparse_feature, dense_feature):
    """
    Aggregate with 'count'/'nunique' for sparse features and 'mean'/'max'/'min'/'std' for dense features
    :param df: dataset for feature engineering
    :param sparse_feature: list of features with few distinct values
    :param dense_feature: list of features with many distinct values
    :return: Dataframe
    """
    for d in tqdm(sparse_feature):
        aggregation_dict = {}
        for s in sparse_feature:
            aggregation_dict[s] = ['count', 'nunique']
        for den in dense_feature:
            aggregation_dict[den] = ['mean', 'max', 'min', 'std']
        aggregation_dict.pop(d)
        temp = df.groupby(d).agg(aggregation_dict).reset_index()
        temp.columns = [d] + get_new_columns(d, aggregation_dict)
        df = pd.merge(df, temp, on=d, how='left')
    return df
In [60]:
print(data.shape)
data = agg_feature_list(data, sparse_feature, dense_feature)
print(data.shape)
(36169, 29)
100%|██████████| 10/10 [00:01<00:00, 9.39it/s]
(36169, 467)
Row-wise statistics and percentiles¶
In [61]:
import pandas as pd
import numpy as np
# Suppose df is your DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [2, 3, 4, 5, 6],
    'C': [3, 4, 5, 6, 7],
    'D': [4, 5, 6, 7, 8],
    'E': [5, 6, 7, 8, 9]
})
original_df = df.copy()
df['max'] = original_df.apply(np.max, axis=1)
df['min'] = original_df.apply(np.min, axis=1)
df['mean'] = original_df.apply(np.mean, axis=1)
df['25%'] = original_df.apply(lambda row: np.percentile(row, 25), axis=1)
df['75%'] = original_df.apply(lambda row: np.percentile(row, 75), axis=1)
df['std'] = original_df.apply(np.std, axis=1)
In [62]:
df
Out[62]:
 | A | B | C | D | E | max | min | mean | 25% | 75% | std
---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 2 | 3 | 4 | 5 | 5 | 1 | 3.0 | 2.0 | 4.0 | 1.414214 |
1 | 2 | 3 | 4 | 5 | 6 | 6 | 2 | 4.0 | 3.0 | 5.0 | 1.414214 |
2 | 3 | 4 | 5 | 6 | 7 | 7 | 3 | 5.0 | 4.0 | 6.0 | 1.414214 |
3 | 4 | 5 | 6 | 7 | 8 | 8 | 4 | 6.0 | 5.0 | 7.0 | 1.414214 |
4 | 5 | 6 | 7 | 8 | 9 | 9 | 5 | 7.0 | 6.0 | 8.0 | 1.414214 |
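The same statistics can be computed without apply, using pandas' vectorized row-wise reductions, which are usually much faster on large frames (a sketch reusing original_df from above):
stats_df = original_df.copy()
stats_df['max'] = original_df.max(axis=1)
stats_df['min'] = original_df.min(axis=1)
stats_df['mean'] = original_df.mean(axis=1)
stats_df['25%'] = original_df.quantile(0.25, axis=1)
stats_df['75%'] = original_df.quantile(0.75, axis=1)
stats_df['std'] = original_df.std(axis=1, ddof=0)  # ddof=0 matches np.std used above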
CountVectorizer¶
In [63]:
data['new_con'] = data['job'].astype(str)
for i in ['marital', 'education', 'contact', 'month', 'poutcome']:
    data['new_con'] = data['new_con'].astype(str) + '_' + data[i].astype(str)
data['new_con'] = data['new_con'].apply(lambda x: ' '.join(x.split('_')))
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/ipykernel_launcher.py:1: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()` """Entry point for launching an IPython kernel.
In [64]:
vector_feature = ['new_con']
In [65]:
from sklearn.feature_extraction.text import CountVectorizer
def get_count_vectorizer(df, vector_feature):
    """
    Apply CountVectorizer to the given features
    :param df: dataset for feature engineering
    :param vector_feature: list, features to vectorize
    :return:
    """
    cv = CountVectorizer()
    for feature in vector_feature:
        cv.fit(df[feature])
        # get_feature_names is deprecated; use get_feature_names_out
        result_df = pd.DataFrame(data=cv.transform(df[feature]).toarray(), index=None, columns=cv.get_feature_names_out())
        df = pd.concat([df, result_df], axis=1)
    return df
In [66]:
print(data.shape)
data = get_count_vectorizer(data, vector_feature)
data = data.drop(vector_feature, axis=1)
print(data.shape)
(36169, 468)
(36169, 504)
dummies¶
In [67]:
print(data.shape)
df_all_t = pd.get_dummies(data)
print(df_all_t.shape)
(36169, 504)
(36169, 539)
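get_dummies is applied to the combined frame here, so train and test automatically share the same dummy columns. If the two sets were encoded separately, the columns would need to be aligned explicitly; a sketch:
# Encoding train and test separately can yield mismatched dummy columns;
# align() adds the missing ones, filled with 0
tr_d = pd.get_dummies(df_train[cate_features])
te_d = pd.get_dummies(df_test[cate_features])
tr_d, te_d = tr_d.align(te_d, join='left', axis=1, fill_value=0)
print(tr_d.shape[1] == te_d.shape[1])  # True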
LabelEncoder¶
# Encoding non-numeric columns
from sklearn.preprocessing import LabelEncoder
class MultiColumnLabelEncoder:
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode
    def fit(self, X, y=None):
        return self  # not relevant here
    def transform(self, X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        dictionary = {}
        if self.columns is not None:
            for col in self.columns:
                le = LabelEncoder()
                output[col] = le.fit_transform(output[col])
                # Record the class -> code mapping for this column
                dictionary[col] = dict(zip(le.classes_, range(len(le.classes_))))
        else:
            for colname, col in output.items():  # iteritems is deprecated
                le = LabelEncoder()
                output[colname] = le.fit_transform(col)
                dictionary[colname] = dict(zip(le.classes_, range(len(le.classes_))))
        return output, dictionary
    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)
Usage:
app_train,app_train_encode_dict = (MultiColumnLabelEncoder(columns = ['location_region','location_center','overall_qual','trade_type','real_relative_floor']).fit_transform(app_train))
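Adapted to this dataset, a usage sketch (cate_features was defined earlier; the mapping shown in the comment depends on the classes present):
encoded_df, encode_dict = MultiColumnLabelEncoder(columns=cate_features).fit_transform(df_train.copy())
print(encode_dict['housing'])  # {'no': 0, 'yes': 1}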
Check for null values¶
In [68]:
missing_ratio(df_all_t, is_plot=False)
Out[68]:
 | Missing Ratio
---|---
y | 30.003594 |
campaign_age_std | 0.013824 |
campaign_balance_std | 0.013824 |
campaign_day_std | 0.013824 |
campaign_duration_std | 0.013824 |
campaign_pdays_std | 0.013824 |
campaign_previous_std | 0.013824 |
In [69]:
df_all_t = df_all_t.fillna(0)
In [70]:
missing_ratio(df_all_t, is_plot=False)
Out[70]:
Missing Ratio |
---|
Re-split the train and test sets¶
In [71]:
df_train_t = df_all_t.iloc[:df_train.shape[0],:]
df_test_t = df_all_t.iloc[df_train.shape[0]:,:]
In [72]:
X_train_t = df_train_t.drop(['y', 'ID'], axis=1)
y_train_t = df_train_t[['y']]
Adversarial validation¶
In [73]:
import lightgbm as lgbm
def adversarial_validation(X_train, X_test):
    """
    Adversarial validation
    :param X_train: training set
    :param X_test: test set
    :return:
    """
    df_train = X_train.copy()
    df_test = X_test.copy()
    # Define the new target
    df_train['Is_Test'] = 0
    df_test['Is_Test'] = 1
    # Combine Train and Test into one dataset (HasDetections, the original target in the
    # source notebook this was adapted from, is excluded)
    df_adv = pd.concat([df_train, df_test])
    adv_data = lgbm.Dataset(data=df_adv.drop('Is_Test', axis=1), label=df_adv.loc[:, 'Is_Test'])
    # Define the model parameters
    params = {
        'boosting_type': 'gbdt',
        'colsample_bytree': 1,
        'learning_rate': 0.1,
        'max_depth': 5,
        'min_child_samples': 100,
        'min_child_weight': 1,
        'min_split_gain': 0.0,
        'num_leaves': 20,
        'objective': 'binary',
        'random_state': 50,
        'subsample': 1.0,
        'subsample_freq': 0,
        'metric': 'auc',
        'num_threads': 8,
        'verbosity': -1
    }
    # Cross-validation; early_stopping_rounds/verbose_eval are deprecated, so use callbacks
    adv_cv_results = lgbm.cv(params,
                             adv_data,
                             num_boost_round=10000,
                             nfold=5,
                             # categorical_feature=categorical_columns,
                             callbacks=[lgbm.early_stopping(200), lgbm.log_evaluation(1)],
                             seed=6)
    print('Best CV AUC: {:.5f}, standard deviation: {:.5f}.'.format(
        adv_cv_results['auc-mean'][-1], adv_cv_results['auc-stdv'][-1]))
    print('Optimal number of iterations: {}.'.format(len(adv_cv_results['auc-mean'])))
In [74]:
# Note: adversarial validation should compare train features with test features, e.g.
# adversarial_validation(X_train_t, df_test_t.drop(['y', 'ID'], axis=1))
# The call below passes y_train_t instead, so the two frames share no columns, the
# concatenated data is mostly NaN, and the model separates them trivially (AUC = 1);
# the output below is therefore not a meaningful train/test distribution check.
adversarial_validation(X_train_t, y_train_t)
[iteration log truncated: rounds 1-201 all report cv_agg's auc: 1 + 0]
Best CV AUC: 1.00000, standard deviation: 0.00000.
Optimal number of iterations: 1.
Model initialization¶
In [75]:
lgbc = lgb.LGBMClassifier(objective='binary', metric='auc', subsample_freq=1, learning_rate=0.001, random_state=666)
Define CV¶
In [76]:
n_fold = 5
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=6)
Inspect AUC on the training set¶
In [77]:
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import KFold
def plot_roc_curve(classifier, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=6)):
    """
    Plot the ROC curve via 5-fold cross-validation
    :param classifier: model
    :param X: array-like, features
    :param y: array-like, target
    :param cv: cross-validation strategy
    :return:
    """
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    i = 0
    for train, test in cv.split(X, y):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        # Compute ROC curve and area under the curve
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        tprs.append(np.interp(mean_fpr, fpr, tpr))  # scipy.interp is deprecated
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)
        plt.plot(fpr, tpr, lw=1, alpha=0.3,
                 label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
        i += 1
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    plt.plot(mean_fpr, mean_tpr, color='b',
             label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
             lw=2, alpha=.8)
    print(r'Mean ROC (AUC = %0.2f ± %0.2f)' % (mean_auc, std_auc))
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                     label=r'$\pm$ 1 std. dev.')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
In [78]:
n_fold = 5
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=6)
plot_roc_curve(lgbc, X_train_t.values, y_train_t.values.ravel(), cv)
Mean ROC (AUC = 0.92 ± 0.00)
Assess model fit via roc_auc¶
In [79]:
from sklearn.model_selection import learning_curve
def plot_learning_curve_proba(estimator, X, y, cv=None, scoring='roc_auc',
                              title=None, ylim=None, n_jobs=None,
                              train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Function from sklearn for plotting learning curves
    Generate a simple plot of the test and training learning curve.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    if title is None:
        title = estimator.__class__.__name__
    scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring).mean()
    print(title, ": get_scores_mean =", scores)
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
In [80]:
plot_learning_curve_proba(lgbc, X_train_t.values, y_train_t.values.ravel(), cv=None, scoring = 'roc_auc', train_sizes=np.linspace(.89, 1.0, 5))
LGBMClassifier : get_scores_mean = 0.9185733518317141
Feature selection¶
Filter by variance¶
In [81]:
from sklearn.feature_selection import VarianceThreshold
def variance_threshold_selector(df, threshold=0.0):
    """
    Drop near-constant features
    :param df: dataset to filter
    :param threshold: float, e.g. (.98 * (1 - .98))
    :return: Dataframe
    """
    selector = VarianceThreshold(threshold)
    selector.fit(df)
    return df[df.columns[selector.get_support(indices=True)]]
In [82]:
print(X_train_t.shape)
X_train_t = variance_threshold_selector(X_train_t, (.999 * (1 - .999)))
print(X_train_t.shape)
(25317, 537)
(25317, 422)
Filter by feature collinearity¶
In [83]:
def remove_collinear_variables(df, threshold=0.95):
    """
    Drop features that are highly correlated with another feature
    :param df: dataset to filter
    :param threshold: float
    :return: Dataframe
    """
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    df = df.drop(columns=to_drop)
    return df, to_drop, upper
In [84]:
print(X_train_t.shape)
X_train_t, to_drop, upper = remove_collinear_variables(X_train_t)
print(X_train_t.shape)
(25317, 422)
(25317, 156)
Filter by correlation with the target¶
In [85]:
def remove_by_target_corr(df, target, threshold=0.05):
    """
    Drop features whose linear correlation with the target is low
    :param df: dataset to filter
    :param target: target column
    :param threshold: threshold
    :return: Dataframe
    """
    corr = df.corr()
    corr.sort_values([target], ascending=False, inplace=True)
    selected_fea = list(corr[abs(corr[target]) >= threshold].index)
    return df[selected_fea]
In [86]:
print(df_train_t.shape)
df_train_corr = remove_by_target_corr(df_train_t, 'y')
print(df_train_corr.shape)
(25317, 539)
(25317, 265)
RFE¶
In [87]:
from sklearn.feature_selection import RFE
def remove_through_rfe(estimator, X, y, n_features_to_select=200, step=0.05):
    """
    Select features by recursively eliminating the least important ones
    :param estimator: model; must expose feature importances
    :param X: Dataframe, features
    :param y: Dataframe, target
    :param n_features_to_select: number of features to keep
    :param step: features to drop per iteration; an int drops that many, a float drops that fraction
    :return:
    """
    rfe = RFE(estimator=estimator, n_features_to_select=n_features_to_select, step=step)
    rfe = rfe.fit(X.values, y)
    fea_rank_ = pd.DataFrame({'cols': X.columns, 'fea_rank': rfe.ranking_})
    rfe_selected_features = fea_rank_[fea_rank_['fea_rank'] == 1]['cols'].values
    rfe_drop_features = fea_rank_[fea_rank_['fea_rank'] > 1]['cols'].values
    return rfe_selected_features, rfe_drop_features, rfe.support_
In [88]:
X_train_t.shape
Out[88]:
(25317, 156)
In [89]:
rfe_selected_features, rfe_drop_features, rfe_support = remove_through_rfe(lgbc, X_train_t, y_train_t.squeeze(), n_features_to_select=100, step=10)
In [90]:
len(rfe_selected_features)
Out[90]:
100
Select via model feature importance¶
In [91]:
from lightgbm.callback import early_stopping
from lightgbm.callback import log_evaluation
def remove_through_feature_importance(estimator, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=6), times=10):
    """
    Select features via xgboost or lightgbm, keeping only those with importance greater than 0
    :param estimator: model
    :param X: Dataframe, features
    :param y: Dataframe, target
    :param cv: cross-validation strategy
    :param times: int, number of model runs
    :return:
    """
    fi = np.zeros(X.shape[1])
    for _ in range(times):
        for train_index, valid_index in cv.split(X, y):
            train_features, valid_features = X.iloc[train_index], X.iloc[valid_index]
            train_y, valid_y = y.iloc[train_index], y.iloc[valid_index]
            callbacks = [early_stopping(stopping_rounds=100, first_metric_only=True), log_evaluation(period=0)]
            estimator.fit(train_features,
                          train_y,
                          callbacks=callbacks,
                          eval_set=[(valid_features, valid_y)],
                          )
            fi += estimator.feature_importances_
    fi = fi / cv.get_n_splits() / times
    fi = pd.DataFrame({'feature': list(X.columns), 'importance': fi}).sort_values('importance', ascending=False)
    zero_features = list(fi[fi['importance'] == 0.0]['feature'])
    selected_features = list(fi[fi['importance'] > 0]['feature'])
    return zero_features, selected_features, fi
In [92]:
zero_features, selected_features, fi = remove_through_feature_importance(lgbc, X_train_t, y_train_t.squeeze(), cv=cv, times=1)
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[44] valid_0's auc: 0.923314
Evaluated only: auc
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100] valid_0's auc: 0.915777
Evaluated only: auc
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[48] valid_0's auc: 0.916964
Evaluated only: auc
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[97] valid_0's auc: 0.910574
Evaluated only: auc
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100] valid_0's auc: 0.928906
Evaluated only: auc
Choose the optimal number of features¶
In [93]:
def get_model_fea_num(estimator, X, y, sorted_features, max_feature_num=None,
                      cv=KFold(n_splits=5, shuffle=True, random_state=6), scoring='roc_auc',
                      step=1):
    """
    Given features already sorted by importance (e.g. as returned by remove_through_feature_importance),
    return the optimal prefix of features
    :param estimator: model
    :param X: Dataframe, features
    :param y: Dataframe, target
    :param sorted_features: features sorted by importance, e.g. as returned by remove_through_feature_importance
    :param max_feature_num: maximum number of features to try; if None, try all
    :param cv: cross-validation strategy
    :param scoring: String or scoring function
    :param step: step size when iterating over feature counts
    :return:
    """
    fea_nums = []
    avg_val_cv_score_list = []
    arr_mean_list = []
    arr_std_list = []
    if max_feature_num is None:
        max_feature_num = len(sorted_features)
    for i in range(2, min(X.shape[1], max_feature_num + 1), step):
        data = X[sorted_features[:i]]
        val_cv_score = cross_val_score(estimator, data.values, y.values, cv=cv, scoring=scoring)
        arr_mean = np.mean(val_cv_score)
        arr_std = np.std(val_cv_score, ddof=1)
        avg_val_cv_score_list.append(arr_mean)
        fea_nums.append(i)
        arr_mean_list.append(arr_mean)
        print('feature index: %d, arr_mean: %0.4f, arr_std: %0.4f' % (i, arr_mean, arr_std))
        arr_std_list.append(arr_std)
    fea_num = fea_nums[avg_val_cv_score_list.index(max(avg_val_cv_score_list))]
    selected_fea = sorted_features[:fea_num]
    return fea_num, selected_fea, fea_nums, arr_mean_list, arr_std_list
In [94]:
fea_num, selected_fea, fea_nums, arr_mean_list, arr_std_list = get_model_fea_num(lgbc, X_train_t, y_train_t.squeeze(), selected_features, max_feature_num=15)
feature index: 2, arr_mean: 0.8474, arr_std: 0.0079
feature index: 3, arr_mean: 0.9008, arr_std: 0.0081
feature index: 4, arr_mean: 0.9061, arr_std: 0.0096
feature index: 5, arr_mean: 0.9113, arr_std: 0.0082
feature index: 6, arr_mean: 0.9114, arr_std: 0.0064
feature index: 7, arr_mean: 0.9112, arr_std: 0.0061
feature index: 8, arr_mean: 0.9069, arr_std: 0.0098
feature index: 9, arr_mean: 0.9105, arr_std: 0.0111
feature index: 10, arr_mean: 0.9119, arr_std: 0.0111
feature index: 11, arr_mean: 0.9117, arr_std: 0.0106
feature index: 12, arr_mean: 0.9135, arr_std: 0.0114
feature index: 13, arr_mean: 0.9131, arr_std: 0.0115
feature index: 14, arr_mean: 0.9134, arr_std: 0.0094
feature index: 15, arr_mean: 0.9138, arr_std: 0.0069
In [95]:
plt.plot(fea_nums, arr_mean_list)
plt.show()
In [96]:
plt.plot(fea_nums, arr_std_list)
plt.show()
Mlxtend SFS¶
In [97]:
import copy
from sklearn.model_selection import KFold
from mlxtend.feature_selection import SequentialFeatureSelector
def print_subsets(subsets, header_list):
    """
    Print SFS results, filling in the feature names
    :param subsets:
    :param header_list:
    :return:
    """
    header_array = np.array(header_list)
    fea_dict = copy.deepcopy(subsets)
    for key, value in fea_dict.items():
        print('key:', key)
        fea_dict[key]['fea_num'] = key
        print('\tfeature_idx:', value['feature_idx'])
        print('\tcv_scores:', value['cv_scores'])
        print('\tavg_score:', value['avg_score'])
        print('\tfeature_names:', header_array[list(value['feature_idx'])])
        fea_dict[key]['feature_names'] = header_array[list(value['feature_idx'])]
        print('-----------------------------------')
    return pd.DataFrame.from_dict(fea_dict).T
def remove_through_sfs(estimator, X, y,
                       cv=KFold(n_splits=5, shuffle=True, random_state=6),
                       n_jobs=5, k_features='best', scoring='roc_auc', floating=False, forward=True):
    """
    Select features via SFS, SBS, SFFS or SBFS
    :param estimator: model
    :param X: Dataframe, features
    :param y: Dataframe, target
    :param cv: cross-validation strategy
    :param n_jobs: number of parallel jobs
    :param k_features: "best", "parsimonious", int, tuple
    :param scoring: String or scoring function
    :param floating: bool, whether to use floating search
    :param forward: True for forward search, False for backward search
    :return:
    """
    cv_list = list(cv.split(X.values, y.values.ravel()))
    feature_selector = SequentialFeatureSelector(estimator=estimator,
                                                 n_jobs=n_jobs,
                                                 pre_dispatch=2 * n_jobs,
                                                 k_features=k_features,
                                                 forward=forward,
                                                 floating=floating,
                                                 verbose=1,
                                                 scoring=scoring,
                                                 cv=cv_list)
    feature_selector = feature_selector.fit(X.values, y.values.ravel())
    sfs_result = print_subsets(feature_selector.get_metric_dict(), list(X.columns.values))
    filtered_features = X.columns[list(feature_selector.k_feature_idx_)]
    return list(filtered_features), sfs_result, feature_selector.subsets_
In [98]:
sfs_fea, sfs_result, sfs_filtered_fea = remove_through_sfs(lgbc, X_train_t[selected_features], y_train_t, cv=cv, k_features=2)
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done 40 tasks | elapsed: 6.2s
[Parallel(n_jobs=5)]: Done 51 out of 51 | elapsed: 7.3s finished
Features: 1/2
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done 40 tasks | elapsed: 8.5s
[Parallel(n_jobs=5)]: Done 50 out of 50 | elapsed: 10.1s finished
Features: 2/2
key: 1
    feature_idx: (0,)
    cv_scores: [0.80226666 0.79810757 0.78784109 0.7888948 0.81707377]
    avg_score: 0.7988367768853533
    feature_names: ['duration']
-----------------------------------
key: 2
    feature_idx: (0, 2)
    cv_scores: [0.89835497 0.89641772 0.88761565 0.89024351 0.89473057]
    avg_score: 0.8934724825917986
    feature_names: ['duration' 'agg_count_day_month_housing']
-----------------------------------
sklearn SFS feature selection¶
In [99]:
from sklearn.feature_selection import SequentialFeatureSelector
lgbc = lgb.LGBMClassifier(objective='binary', metric='auc', subsample_freq=1, learning_rate=0.01, random_state=666, n_jobs=2)
# lgbm_sk_sfs = SequentialFeatureSelector(estimator=lgbc,
#                                         n_features_to_select='auto',
#                                         tol=0.01,
#                                         direction='forward',
#                                         scoring='roc_auc',
#                                         cv=cv,
#                                         n_jobs=4)
lgbm_sk_sfs = SequentialFeatureSelector(estimator=lgbc,
                                        n_features_to_select=2,
                                        direction='forward',
                                        scoring='roc_auc',
                                        cv=cv,
                                        n_jobs=2)
lgbm_sk_sfs.fit(X_train_t, y_train_t.squeeze())
Out[99]:
SequentialFeatureSelector(cv=StratifiedKFold(n_splits=5, random_state=6, shuffle=True), estimator=LGBMClassifier(learning_rate=0.01, metric='auc', n_jobs=2, objective='binary', random_state=666, subsample_freq=1), n_features_to_select=2, n_jobs=2, scoring='roc_auc')
In [100]:
lgbm_sk_sfs_fea = lgbm_sk_sfs.get_feature_names_out()
lgbm_sk_sfs_fea
Out[100]:
array(['duration', 'agg_count_day_month_housing'], dtype=object)
RFECV feature selection¶
In [101]:
from sklearn.feature_selection import RFECV
lgbc = lgb.LGBMClassifier(objective='binary', metric='auc', subsample_freq=1, learning_rate=0.01, random_state=666, n_jobs=2)
lgbm_rfe = RFECV(estimator=lgbc,
                 step=1,
                 min_features_to_select=1,
                 cv=cv,
                 scoring='roc_auc',
                 verbose=0,
                 n_jobs=4,
                 importance_getter='auto')
lgbm_rfe = lgbm_rfe.fit(X_train_t, y_train_t.squeeze())
In [102]:
print('Number of features in the Dataframe:', X_train_t.shape[1])
print('n_features_in_:', lgbm_rfe.n_features_in_)
print('Number of features used (n_features_):', lgbm_rfe.n_features_)
print('Names of the features used:', lgbm_rfe.feature_names_in_[lgbm_rfe.support_])
Number of features in the Dataframe: 156
n_features_in_: 156
Number of features used (n_features_): 50
Names of the features used: ['age' 'balance' 'day' 'duration' 'campaign' 'pdays' 'agg_count_day_month_campaign' 'agg_count_day_month_pdays' 'agg_count_day_month_job' 'agg_count_day_month_marital' 'agg_count_day_month_education' 'agg_count_day_month_default' 'agg_count_day_month_housing' 'agg_count_day_month_loan' 'agg_count_day_month_contact' 'job_campaign_mean' 'job_balance_min' 'job_duration_max' 'marital_job_count' 'housing_job_count' 'contact_job_count' 'contact_campaign_mean' 'contact_campaign_max' 'contact_age_max' 'month_job_count' 'month_campaign_mean' 'month_campaign_max' 'month_campaign_std' 'month_age_mean' 'month_age_max' 'month_age_min' 'month_age_std' 'month_balance_mean' 'month_balance_max' 'month_balance_min' 'month_day_mean' 'month_day_std' 'month_duration_mean' 'month_duration_max' 'month_duration_min' 'month_duration_std' 'month_pdays_mean' 'month_pdays_max' 'month_pdays_std' 'month_previous_std' 'poutcome_job_count' 'poutcome_age_mean' 'poutcome_age_max' 'campaign_age_mean' 'unknown']
In [103]:
lgbm_fea_rank_ = pd.DataFrame({'cols': lgbm_rfe.feature_names_in_,
                               'fea_rank': lgbm_rfe.ranking_,
                               # 注意:cv_results_ 是按“子集大小”索引的(第 i 行对应保留 i+1 个特征时的得分),
                               # 与 cols/fea_rank 的按特征索引只是长度恰好相同,放在一起仅作参考
                               'mean_test_score': lgbm_rfe.cv_results_['mean_test_score'],
                               'std_test_score': lgbm_rfe.cv_results_['std_test_score'],
                               })
lgbm_fea_rank_.sort_values(by=['fea_rank'], ascending=True, inplace=True)
lgbm_rfe_selected_features = lgbm_fea_rank_[lgbm_fea_rank_['fea_rank'] == 1]['cols'].values
lgbm_rfe_drop_features = lgbm_fea_rank_[lgbm_fea_rank_['fea_rank'] > 1]['cols'].values
lgbm_fea_rank_
Out[103]:
cols | fea_rank | mean_test_score | std_test_score | |
---|---|---|---|---|
0 | age | 1 | 0.736006 | 0.003809 |
61 | contact_job_count | 1 | 0.933702 | 0.002466 |
62 | contact_campaign_mean | 1 | 0.933652 | 0.002423 |
63 | contact_campaign_max | 1 | 0.933627 | 0.002448 |
64 | contact_age_max | 1 | 0.933564 | 0.002440 |
... | ... | ... | ... | ... |
57 | education_pdays_std | 103 | 0.933676 | 0.002346 |
137 | employed | 104 | 0.933570 | 0.002437 |
58 | default_job_count | 105 | 0.933720 | 0.002390 |
138 | entrepreneur | 106 | 0.933570 | 0.002437 |
155 | education_unknown | 107 | 0.933570 | 0.002437 |
156 rows × 4 columns
In [104]:
lgbm_rfe_selected_features
Out[104]:
array(['age', 'contact_job_count', 'contact_campaign_mean', 'contact_campaign_max', 'contact_age_max', 'month_job_count', 'month_campaign_mean', 'month_campaign_max', 'month_campaign_std', 'month_age_mean', 'month_age_max', 'month_age_min', 'month_age_std', 'month_balance_mean', 'month_balance_max', 'unknown', 'month_day_mean', 'month_day_std', 'month_duration_mean', 'month_duration_max', 'month_duration_min', 'month_duration_std', 'month_pdays_mean', 'month_pdays_max', 'month_pdays_std', 'month_previous_std', 'poutcome_job_count', 'poutcome_age_mean', 'poutcome_age_max', 'campaign_age_mean', 'housing_job_count', 'marital_job_count', 'month_balance_min', 'balance', 'campaign', 'pdays', 'day', 'agg_count_day_month_campaign', 'agg_count_day_month_pdays', 'agg_count_day_month_job', 'agg_count_day_month_marital', 'agg_count_day_month_education', 'agg_count_day_month_default', 'agg_count_day_month_housing', 'agg_count_day_month_loan', 'agg_count_day_month_contact', 'job_campaign_mean', 'job_duration_max', 'job_balance_min', 'duration'], dtype=object)
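筛完可以用选出的子集再做一次交叉验证,确认 AUC 相对全量特征的变化(最小示意,假设上文的 lgbc、cv、X_train_t 等变量仍可用):

subset_auc = cross_val_score(lgbc, X_train_t[lgbm_rfe_selected_features],
                             y_train_t.values.ravel(), cv=cv, scoring='roc_auc').mean()
print('RFECV 子集 AUC:', subset_auc)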
超参数搜索¶
定义交叉验证策略¶
# Validation function
from sklearn.model_selection import KFold

n_folds = 5

def rmsle_cv(model):
    # 直接把 KFold 对象传给 cv;原写法 .get_n_splits() 只返回折数,shuffle/random_state 会被丢掉
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
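使用方式示意(假设 model 是任意 sklearn 兼容的回归器,train、y_train 来自相应的回归任务上下文):

score = rmsle_cv(model)
print('RMSE: {:.4f} (+/- {:.4f})'.format(score.mean(), score.std()))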
通过roc_auc选择超参数¶
In [105]:
from sklearn.model_selection import validation_curve
import matplotlib.pyplot as plt
def plot_validation_curve(estimator, X, y, param_name, param_range, cv=5, scoring='roc_auc', n_jobs=1, ylim=None):
"""
通过指定评分函数选择超参数
:param estimator: 要评估的模型
:param X: array-like, 属性
:param y: array-like, 目标
:param param_name: 要搜索的参数名
:param param_range: 要搜索的参数取值范围
:param cv: 交叉验证方式
:param scoring: 评分函数
:param n_jobs: 进程数
:param ylim: 画图时ylim的取值,例如(0.0, 1.1)
:return:
"""
train_scores, test_scores = validation_curve(estimator, X, y, param_name=param_name, param_range=param_range,
cv=cv, scoring=scoring, n_jobs=n_jobs)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with SVM")
plt.xlabel(r"$\gamma$")
plt.ylabel("Score")
if ylim is not None:
plt.ylim(*ylim)
lw = 2
plt.semilogx(param_range, train_scores_mean, label="Training score",
color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.2,
color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.2,
color="navy", lw=lw)
plt.legend(loc="best")
plt.show()
In [106]:
param_name = 'reg_alpha'
param_range = np.logspace(-2, 3, 8)
plot_validation_curve(lgbc, X_train_t.values, y_train_t.values.ravel(), param_name, param_range)
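除了看图,也可以直接用 validation_curve 的返回值取交叉验证得分最高的参数(最小示意,沿用上文的 lgbc、cv 与 param_range):

train_scores, test_scores = validation_curve(
    lgbc, X_train_t.values, y_train_t.values.ravel(),
    param_name=param_name, param_range=param_range,
    cv=cv, scoring='roc_auc', n_jobs=4)
# 对每个参数取值在各折上取平均,再取最大值所在位置
best_reg_alpha = param_range[test_scores.mean(axis=1).argmax()]
print('CV 得分最高的 reg_alpha:', best_reg_alpha)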
网格搜索¶
In [107]:
lgbc=lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', n_jobs=4)
# Define the search space
space = {
'boosting_type': ['gbdt',],
'subsample': [0.8, 0.85, ],
'num_leaves': [63, 127, ],
'learning_rate': [0.05, 0.1,],
'colsample_bytree': [0.7, 0.8, ],
'max_depth': [7, 12,],
}
In [108]:
from sklearn.model_selection import GridSearchCV
grid_cv_tuner = GridSearchCV(estimator=lgbc,
param_grid=space,
n_jobs=4,
verbose=1,
cv=cv,
scoring='roc_auc',)
grid_cv_result = grid_cv_tuner.fit(X_train_t.values, y_train_t.values.ravel())
print('最优参数:', grid_cv_result.best_params_)
Fitting 5 folds for each of 32 candidates, totalling 160 fits 最优参数: {'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 12, 'num_leaves': 63, 'subsample': 0.8}
In [109]:
model_grid = lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', n_jobs=8)
model_grid.set_params(**grid_cv_result.best_params_)
model_grid
Out[109]:
LGBMClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=12, metric='auc', n_jobs=8, num_leaves=63, objective='binary', random_state=666, subsample=0.8)
In [110]:
scores = cross_val_score(model_grid, X_train_t.values, y_train_t.values.ravel(), cv=cv, scoring='roc_auc').mean()
scores
Out[110]:
0.9398604932218328
随机搜索¶
In [111]:
lgbc=lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', n_jobs=4)
# Define the search space
space = {
'boosting_type': ['gbdt', 'dart',],
'subsample': [0.8, 0.85, 0.90, 0.95, 1.0],
'num_leaves': [7, 15, 31, 63, 127, 255, 511],
'learning_rate': [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1,],
'min_child_samples': [5, 10, 15, 20, 25, 30, 40, 50],
'reg_alpha': [0, 0.01, 0.1, 0.5, 1.0],
'reg_lambda': [0, 0.001, 0.01, 0.02, 0.05, 0.1, 1.0],
'colsample_bytree': [0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1.0],
'is_unbalance': [True, False],
'max_depth': [-1, 3, 5, 7, 9, 12, 15, 17, 25],
'n_estimators': [100, 200, 300, 400, 500, 800, 1000, 1500, 2000],
}
# 查找次数
MAX_EVALS = 3
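这个空间如果做完整网格搜索,组合数会非常大,可以先粗估一下(最小示意):

n_combos = 1
for v in space.values():
    n_combos *= len(v)
print('完整网格的组合数:', n_combos)  # 约 1.5 亿,随机搜索只从中抽 MAX_EVALS 组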
In [112]:
from sklearn.model_selection import RandomizedSearchCV
random_cv_tuner = RandomizedSearchCV(estimator=lgbc,
param_distributions=space,
n_jobs=4,
n_iter=MAX_EVALS,
verbose=1,
cv=cv,
scoring='roc_auc',
random_state=666,)
random_cv_result = random_cv_tuner.fit(X_train_t.values, y_train_t.values.ravel())
print('最优参数:', random_cv_result.best_params_)
Fitting 5 folds for each of 3 candidates, totalling 15 fits 最优参数: {'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 1.0, 'num_leaves': 511, 'n_estimators': 400, 'min_child_samples': 25, 'max_depth': 25, 'learning_rate': 0.005, 'is_unbalance': True, 'colsample_bytree': 0.8, 'boosting_type': 'dart'}
In [113]:
model_random = lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', n_jobs=8)
model_random.set_params(**random_cv_result.best_params_)
model_random
Out[113]:
LGBMClassifier(boosting_type='dart', colsample_bytree=0.8, is_unbalance=True, learning_rate=0.005, max_depth=25, metric='auc', min_child_samples=25, n_estimators=400, n_jobs=8, num_leaves=511, objective='binary', random_state=666, reg_alpha=1.0, reg_lambda=0.1, subsample=0.8)
In [114]:
scores = cross_val_score(model_random, X_train_t.values, y_train_t.values.ravel(), cv=cv, scoring='roc_auc').mean()
scores
Out[114]:
0.9341663475818229
贝叶斯搜索¶
Scikit-Optimize¶
In [115]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
N_FOLDS = 5
MAX_EVALS = 2
In [116]:
lgbc=lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', n_jobs=1)
# Define the search space
space = {
'boosting_type': Categorical(['gbdt', 'dart',]),
'subsample': Real(0.8, 1),
'num_leaves': Categorical([7, 15, 31, 63, 127, 255, 511]),
'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
'min_child_samples': Integer(5, 50),
'reg_alpha': Categorical([0, 0.01, 0.1, 0.5, 1.0]),
'reg_lambda': Categorical([0, 0.001, 0.01, 0.02, 0.05, 0.1, 1.0]),
'colsample_bytree': Real(0.6, 1.0),
'is_unbalance': Categorical([True, False]),
'max_depth': Categorical([-1, 3, 5, 7, 9, 12, 15, 17, 25]),
'n_estimators': Integer(100, 2000),
}
In [117]:
lgb_bayes_cv_tuner = BayesSearchCV(
    estimator=lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', n_jobs=8),
    search_spaces=space,
    n_jobs=4,
    n_iter=MAX_EVALS,
    pre_dispatch=2*5,
    n_points=20,
    verbose=1,
    cv=cv,
    scoring='roc_auc',
    optimizer_kwargs={'base_estimator': 'GP'}
)
sh_bayes_cv_result = lgb_bayes_cv_tuner.fit(X_train_t.values, y_train_t.values.ravel())
print('最优参数:', sh_bayes_cv_result.best_params_)
Fitting 5 folds for each of 2 candidates, totalling 10 fits 最优参数: OrderedDict([('boosting_type', 'gbdt'), ('colsample_bytree', 0.9304881347700349), ('is_unbalance', True), ('learning_rate', 0.004310616984184679), ('max_depth', 9), ('min_child_samples', 7), ('n_estimators', 1181), ('num_leaves', 15), ('reg_alpha', 0.1), ('reg_lambda', 0.0), ('subsample', 0.9115912187272238)])
In [118]:
model_bayes = lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', n_jobs=7)
model_bayes.set_params(**sh_bayes_cv_result.best_params_)
model_bayes
Out[118]:
LGBMClassifier(colsample_bytree=0.9304881347700349, is_unbalance=True, learning_rate=0.004310616984184679, max_depth=9, metric='auc', min_child_samples=7, n_estimators=1181, n_jobs=7, num_leaves=15, objective='binary', random_state=666, reg_alpha=0.1, subsample=0.9115912187272238)
In [119]:
scores = cross_val_score(model_bayes, X_train_t.values, y_train_t.values.ravel(), cv=cv, scoring='roc_auc').mean()
scores
Out[119]:
0.9384674137815839
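把三种搜索方式的交叉验证 AUC 放到一起看(数值取自上文各节输出;注意三者的评估预算不同:网格 32 组、随机 3 组、贝叶斯 2 组,数字不能直接当成方法优劣的结论):

pd.DataFrame({
    'method': ['GridSearchCV', 'RandomizedSearchCV', 'BayesSearchCV'],
    'cv_auc': [0.9399, 0.9342, 0.9385],
})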
Hyperopt¶
In [120]:
import csv
import json
from hyperopt import STATUS_OK
from timeit import default_timer as timer
from hyperopt import Trials
from hyperopt import tpe
from hyperopt import fmin
from hyperopt import hp
N_FOLDS = 5
In [121]:
# Define the search space
space = {
'boosting_type': hp.choice('boosting_type',
[{'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
{'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
{'boosting_type': 'goss', 'subsample': 1.0}]),
'num_leaves': hp.quniform('num_leaves', 20, 150, 5),
'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
'subsample_for_bin': hp.quniform('subsample_for_bin', 20000, 300000, 20000),
'min_child_samples': hp.quniform('min_child_samples', 20, 500, 5),
'reg_alpha': hp.uniform('reg_alpha', 0.0, 1.0),
'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
'is_unbalance': hp.choice('is_unbalance', [True, False]),
'max_depth': hp.choice('max_depth', [-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
'min_split_gain': hp.uniform('min_split_gain', 0.0, 1.0),
'min_child_weight': hp.loguniform('min_child_weight', np.log(0.0001), np.log(100)),
'verbosity': hp.choice('verbosity', [-1]),
}
In [122]:
def objective(hyperparameters):
"""Objective function for Gradient Boosting Machine Hyperparameter Optimization.
Writes a new line to `outfile` on every iteration"""
# Keep track of evals
global ITERATION
ITERATION += 1
# Using early stopping to find number of trees trained
if 'n_estimators' in hyperparameters:
del hyperparameters['n_estimators']
# Retrieve the subsample
subsample = hyperparameters['boosting_type'].get('subsample', 1.0)
# Extract the boosting type and subsample to top level keys
hyperparameters['boosting_type'] = hyperparameters['boosting_type']['boosting_type']
hyperparameters['subsample'] = subsample
# Make sure parameters that need to be integers are integers
for parameter_name in ['num_leaves', 'subsample_for_bin', 'min_child_samples']:
hyperparameters[parameter_name] = int(hyperparameters[parameter_name])
start = timer()
train_set = lgb.Dataset(data=X_train_t, label=y_train_t, params=hyperparameters)
# Perform n_folds cross validation
cv_results = lgb.cv(hyperparameters, train_set, num_boost_round = 100, nfold = N_FOLDS,
early_stopping_rounds = 10, metrics = 'auc', seed = 50)
run_time = timer() - start
# Extract the best score
best_score = cv_results['auc-mean'][-1]
# Loss must be minimized
loss = 1 - best_score
# Boosting rounds that returned the highest cv score
n_estimators = len(cv_results['auc-mean'])
# Add the number of estimators to the hyperparameters
hyperparameters['n_estimators'] = n_estimators
# Write to the csv file ('a' means append)
of_connection = open(OUT_FILE, 'a')
writer = csv.writer(of_connection)
writer.writerow([loss, hyperparameters, ITERATION, run_time, best_score])
of_connection.close()
# Dictionary with information for evaluation
return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
'train_time': run_time, 'status': STATUS_OK}
In [123]:
MAX_EVALS = 5
# Create a new file and open a connection
OUT_FILE = 'bayesian_trials_10.csv'
of_connection = open(OUT_FILE, 'w')
writer = csv.writer(of_connection)
# Write column names
headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']
writer.writerow(headers)
of_connection.close()
# Record results
trials = Trials()
In [124]:
# Global variable
global ITERATION
ITERATION = 0
# Run optimization
best = fmin(fn = objective, space = space, algo = tpe.suggest,
trials = trials, max_evals = MAX_EVALS)
best
0%| | 0/5 [00:00<?, ?trial/s, best loss=?]
20%|██ | 1/5 [00:00<00:01, 2.00trial/s, best loss: 0.06653741483827447]
40%|████ | 2/5 [00:01<00:02, 1.15trial/s, best loss: 0.06653741483827447]
60%|██████ | 3/5 [00:02<00:01, 1.06trial/s, best loss: 0.06463691054649812]
80%|████████ | 4/5 [00:03<00:00, 1.05trial/s, best loss: 0.06463691054649812]
100%|██████████| 5/5 [00:04<00:00, 1.08trial/s, best loss: 0.06463691054649812]
(每个 trial 均重复输出 LightGBM 'early_stopping_rounds' 弃用警告与 dart 模式不支持 early stopping 的警告,从略)
Out[124]:
{'boosting_type': 0, 'colsample_by_tree': 0.7851588458179977, 'gdbt_subsample': 0.7357510377183742, 'is_unbalance': 0, 'learning_rate': 0.0546754977350724, 'max_depth': 9, 'min_child_samples': 70.0, 'min_child_weight': 0.492079183819726, 'min_split_gain': 0.7520983267583293, 'num_leaves': 135.0, 'reg_alpha': 0.953530968440641, 'reg_lambda': 0.9802616970117984, 'subsample_for_bin': 280000.0, 'verbosity': 0}
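注意 fmin 返回的 best 里,hp.choice 类参数给出的是索引而不是取值(例如上面的 'boosting_type': 0),可以用 hyperopt 的 space_eval 还原成真实超参数(最小示意):

from hyperopt import space_eval
print(space_eval(space, best))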
In [125]:
# Sort the trials with lowest loss (highest AUC) first
trials_dict = sorted(trials.results, key = lambda x: x['loss'])
print('Finished, best results')
print(trials_dict[:1])
Finished, best results [{'loss': 0.06463691054649812, 'hyperparameters': {'boosting_type': 'gbdt', 'colsample_bytree': 0.7851588458179977, 'is_unbalance': True, 'learning_rate': 0.0546754977350724, 'max_depth': 9, 'min_child_samples': 70, 'min_child_weight': 0.492079183819726, 'min_split_gain': 0.7520983267583293, 'num_leaves': 135, 'reg_alpha': 0.953530968440641, 'reg_lambda': 0.9802616970117984, 'subsample_for_bin': 280000, 'verbosity': -1, 'subsample': 0.7357510377183742, 'n_estimators': 73}, 'iteration': 3, 'train_time': 1.0275821890681982, 'status': 'ok'}]
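也可以直接拿最优 trial 里记录的超参数构造最终模型(最小示意,trials_dict 来自上一格;model_hyperopt 为示意用的变量名):

best_hp = dict(trials_dict[0]['hyperparameters'])
model_hyperopt = lgb.LGBMClassifier(random_state=666, objective='binary', metric='auc', **best_hp)
model_hyperopt.fit(X_train_t, y_train_t.values.ravel())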
In [126]:
# Save the trial results
with open('trials_sh.json', 'w') as f:
f.write(json.dumps(trials_dict))
特征重要性¶
前15个与后15个画图 coef_的特征重要性¶
# Plot important coefficients
coefs = pd.Series(lasso.coef_, index = train.columns)
print("ElasticNet picked " + str(sum(coefs != 0)) + " features and eliminated the other " + str(sum(coefs == 0)) + " features")
imp_coefs = pd.concat([coefs.sort_values().head(15),
coefs.sort_values().tail(15)])
imp_coefs.plot(kind = "barh")
plt.title("Coefficients in the ElasticNet Model")
plt.show()
feature_importance的特征重要性¶
# 通过 sklearn 包装器的 booster_ 获取 gain 口径的特征重要性
importance = model_random.booster_.feature_importance(importance_type='gain')
# 特征名称同样从 booster_ 获取
feature_name = model_random.booster_.feature_name()
feature_importance = pd.DataFrame({'feature_name': feature_name, 'importance': importance})
# sort_values(inplace=True) 返回 None,不能链式调用,这里改为赋值写法
feature_importance = feature_importance.sort_values(by='importance', ascending=False).reset_index(drop=True)
feature_importance
# Plot important coefficients
coefs = pd.Series(model_random.feature_importances_, index = app_train.drop(['price','predict_price','predict_price_diff'], axis=1).columns)
print("Light GBM picked " + str(sum(coefs != 0)) + " features and eliminated the other " + str(sum(coefs == 0)) + " features")
imp_coefs = pd.concat([coefs.sort_values().head(1),
coefs.sort_values().tail(30)])
imp_coefs.plot(kind = "barh")
plt.title("Coefficients in the Light GBM Model")
plt.show()
重要性表格 coef_的¶
feature_importances = pd.DataFrame(lasso.coef_,
index = train.columns,
columns=['weight']).sort_values('weight',ascending=False)
feature_importances
重要性表格 feature_importance的¶
feature_importances = pd.DataFrame(lgbmr_diff.booster_.feature_importance(),
index = app_train.drop(['price','predict_price','predict_price_diff'], axis=1).columns,
columns=['importance']).sort_values('importance',ascending=False)
feature_importances
重要性画图¶
In [127]:
def plot_feature_importances(df, threshold = 0.9):
"""
Plots 15 most important features and the cumulative importance of features.
Prints the number of features needed to reach threshold cumulative importance.
Parameters
--------
df : dataframe
Dataframe of feature importances. Columns must be feature and importance
threshold : float, default = 0.9
Threshold for printing information about cumulative importances
Return
--------
df : dataframe
Dataframe ordered by feature importances with a normalized column (sums to 1)
and a cumulative importance column
"""
plt.rcParams['font.size'] = 18
# Sort features according to importance
df = df.sort_values('importance', ascending = False).reset_index()
# Normalize the feature importances to add up to one
df['importance_normalized'] = df['importance'] / df['importance'].sum()
df['cumulative_importance'] = np.cumsum(df['importance_normalized'])
# Make a horizontal bar chart of feature importances
plt.figure(figsize = (10, 6))
ax = plt.subplot()
# Need to reverse the index to plot most important on top
ax.barh(list(reversed(list(df.index[:15]))),
df['importance_normalized'].head(15),
align = 'center', edgecolor = 'k')
# Set the yticks and labels
ax.set_yticks(list(reversed(list(df.index[:15]))))
ax.set_yticklabels(df['feature'].head(15))
# Plot labeling
plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
plt.show()
# Cumulative importance plot
plt.figure(figsize = (8, 6))
plt.plot(list(range(len(df))), df['cumulative_importance'], 'r-')
plt.xlabel('Number of Features'); plt.ylabel('Cumulative Importance');
plt.title('Cumulative Feature Importance');
plt.show();
importance_index = np.min(np.where(df['cumulative_importance'] > threshold))
print('%d features required for %0.2f of cumulative importance' % (importance_index + 1, threshold))
return df
In [128]:
feature_importance_df = pd.DataFrame({
'feature': ['f1', 'f2', 'f3', 'f4'],
'importance': [10, 40, 30, 20]
})
plot_feature_importances(feature_importance_df)
4 features required for 0.90 of cumulative importance
Out[128]:
index | feature | importance | importance_normalized | cumulative_importance | |
---|---|---|---|---|---|
0 | 1 | f2 | 40 | 0.4 | 0.4 |
1 | 2 | f3 | 30 | 0.3 | 0.7 |
2 | 3 | f4 | 20 | 0.2 | 0.9 |
3 | 0 | f1 | 10 | 0.1 | 1.0 |
画出决策树¶
TODO 画Lightgbm的树¶
In [ ]:
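LightGBM 自带画树接口,一个最小示意(假设用上文的 model_grid,先 fit 再画;plot_tree 依赖 graphviz):

import matplotlib.pyplot as plt
model_grid.fit(X_train_t, y_train_t.values.ravel())  # plot_tree 要求模型已经 fit
# 画出第 0 棵树,tree_index 可换成其它棵
lgb.plot_tree(model_grid, tree_index=0, figsize=(20, 12), show_info=['split_gain'])
plt.show()
# 也可以生成 graphviz 对象,便于后续导出
graph = lgb.create_tree_digraph(model_grid, tree_index=0)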
画png¶
from IPython.display import Image
from sklearn import tree
import pydotplus
# 注:sklearn.externals.six 在新版 sklearn 中已移除;out_file=None 时 export_graphviz 直接返回字符串,无需 StringIO
dot_data = tree.export_graphviz(model_1, out_file=None,
                                feature_names=['predict_price', 'kejiaowenhuafuwu_1_basetype', 'construction_area'],
                                class_names=['alpha1'],
                                filled=True, rounded=True,
                                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
存pdf¶
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(clf, out_file=None,
                                feature_names=iris.feature_names,
                                class_names=iris.target_names,
                                filled=True, rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph
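若要把图落盘,graphviz.Source 的 render 即可输出文件(最小示意,沿用上面的 graph 对象):

graph.render('iris_tree')  # format 默认为 'pdf',会生成 iris_tree.pdf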
Stacking¶
预测¶
五折预测¶
In [129]:
n_splits = 5
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
clf = lgb.LGBMClassifier(
boosting_type="gbdt", num_leaves=30, reg_alpha=0, reg_lambda=0.,
max_depth=-1, n_estimators=200, objective='binary',metric= 'auc',
subsample=0.95, colsample_bytree=0.7, subsample_freq=1,
learning_rate=0.02, random_state=2017, verbosity=-1,
)
res = df_test_t[['ID']].copy()  # copy() 避免 SettingWithCopyWarning
res['pred'] = 0
for train_idx, val_idx in kfold.split(X_train_t, y_train_t):
    # 每折换一个随机种子,增加各折模型间的差异
    clf.random_state = clf.random_state + 1
    train_x1 = X_train_t.iloc[train_idx]
    train_y1 = y_train_t.iloc[train_idx].values.ravel()  # 展平为一维,避免 DataConversionWarning
    test_x1 = X_train_t.iloc[val_idx]
    test_y1 = y_train_t.iloc[val_idx].values.ravel()
    # 新版 LightGBM 建议用 callbacks=[lgb.early_stopping(100)] 替代 early_stopping_rounds
    clf.fit(train_x1, train_y1, eval_set=[(train_x1, train_y1), (test_x1, test_y1)],
            eval_metric='auc', early_stopping_rounds=100)
    # 测试集概率按折累加,循环结束后取平均
    res['pred'] += clf.predict_proba(df_test_t[clf.feature_name_])[:, 1]
res['pred'] = res['pred'] / n_splits
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/lightgbm/sklearn.py:726: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
(每折训练时该警告会重复出现,以下从略)
[1] training's auc: 0.781097 valid_1's auc: 0.766617 [2] training's auc: 0.919492 valid_1's auc: 0.911415 ... [199] training's auc: 0.964429 valid_1's auc: 0.939037 [200] training's auc: 0.964528 valid_1's auc: 0.93901
[1] training's auc: 0.776624 valid_1's auc: 0.772262 [2] training's auc: 0.796614 valid_1's auc: 0.782982 ... [199] training's auc: 0.963675 valid_1's auc: 0.944108 [200] training's auc: 0.96375 valid_1's auc: 0.944101
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/ipykernel_launcher.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/sklearn/preprocessing/_label.py:98: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/sklearn/preprocessing/_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True) /data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/lightgbm/sklearn.py:726: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead. _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. "
[1] training's auc: 0.909697 valid_1's auc: 0.881038 [2] training's auc: 0.922853 valid_1's auc: 0.903733 [3] training's auc: 0.929882 valid_1's auc: 0.90803 [4] training's auc: 0.930996 valid_1's auc: 0.909541 [5] training's auc: 0.934382 valid_1's auc: 0.91245 [6] training's auc: 0.934049 valid_1's auc: 0.91238 [7] training's auc: 0.935598 valid_1's auc: 0.914496 [8] training's auc: 0.935943 valid_1's auc: 0.915354 [9] training's auc: 0.936008 valid_1's auc: 0.914976 [10] training's auc: 0.938175 valid_1's auc: 0.91813 [11] training's auc: 0.940639 valid_1's auc: 0.919456 [12] training's auc: 0.941483 valid_1's auc: 0.920148 [13] training's auc: 0.941907 valid_1's auc: 0.920002 [14] training's auc: 0.94209 valid_1's auc: 0.920618 [15] training's auc: 0.942345 valid_1's auc: 0.920514 [16] training's auc: 0.942898 valid_1's auc: 0.92151 [17] training's auc: 0.943294 valid_1's auc: 0.92212 [18] training's auc: 0.943508 valid_1's auc: 0.922305 [19] training's auc: 0.944083 valid_1's auc: 0.923084 [20] training's auc: 0.94433 valid_1's auc: 0.922868 [21] training's auc: 0.944782 valid_1's auc: 0.92363 [22] training's auc: 0.945073 valid_1's auc: 0.924174 [23] training's auc: 0.945257 valid_1's auc: 0.924457 [24] training's auc: 0.945386 valid_1's auc: 0.924689 [25] training's auc: 0.945502 valid_1's auc: 0.924574 [26] training's auc: 0.945599 valid_1's auc: 0.924422 [27] training's auc: 0.945641 valid_1's auc: 0.924713 [28] training's auc: 0.945669 valid_1's auc: 0.924617 [29] training's auc: 0.945948 valid_1's auc: 0.924825 [30] training's auc: 0.945919 valid_1's auc: 0.924948 [31] training's auc: 0.946131 valid_1's auc: 0.924893 [32] training's auc: 0.946333 valid_1's auc: 0.925427 [33] training's auc: 0.946604 valid_1's auc: 0.925691 [34] training's auc: 0.947045 valid_1's auc: 0.92576 [35] training's auc: 0.947229 valid_1's auc: 0.925935 [36] training's auc: 0.94745 valid_1's auc: 0.926257 [37] training's auc: 0.947727 valid_1's auc: 0.926429 [38] training's auc: 0.947913 valid_1's auc: 0.926478 [39] training's auc: 0.948053 valid_1's auc: 0.926572 [40] training's auc: 0.948211 valid_1's auc: 0.926859 [41] training's auc: 0.948332 valid_1's auc: 0.926766 [42] training's auc: 0.948522 valid_1's auc: 0.926901 [43] training's auc: 0.948704 valid_1's auc: 0.927013 [44] training's auc: 0.94888 valid_1's auc: 0.927088 [45] training's auc: 0.949058 valid_1's auc: 0.927243 [46] training's auc: 0.949182 valid_1's auc: 0.927141 [47] training's auc: 0.949295 valid_1's auc: 0.927243 [48] training's auc: 0.94945 valid_1's auc: 0.92746 [49] training's auc: 0.949549 valid_1's auc: 0.927482 [50] training's auc: 0.949593 valid_1's auc: 0.927542 [51] training's auc: 0.94966 valid_1's auc: 0.927536 [52] training's auc: 0.949726 valid_1's auc: 0.927641 [53] training's auc: 0.9499 valid_1's auc: 0.927714 [54] training's auc: 0.950055 valid_1's auc: 0.9278 [55] training's auc: 0.950119 valid_1's auc: 0.92785 [56] training's auc: 0.950182 valid_1's auc: 0.92803 [57] training's auc: 0.95038 valid_1's auc: 0.928129 [58] training's auc: 0.950497 valid_1's auc: 0.928193 [59] training's auc: 0.950684 valid_1's auc: 0.928284 [60] training's auc: 0.950861 valid_1's auc: 0.928241 [61] training's auc: 0.950949 valid_1's auc: 0.928399 [62] training's auc: 0.951129 valid_1's auc: 0.928467 [63] training's auc: 0.951211 valid_1's auc: 0.928464 [64] training's auc: 0.951375 valid_1's auc: 0.928513 [65] training's auc: 0.951506 valid_1's auc: 0.928574 [66] training's auc: 0.951637 valid_1's auc: 0.928552 [67] training's auc: 
0.951756	valid_1's auc: 0.928635
[68]	training's auc: 0.951876	valid_1's auc: 0.928695
...
[100]	training's auc: 0.955562	valid_1's auc: 0.930299
...
[150]	training's auc: 0.960945	valid_1's auc: 0.931631
...
[200]	training's auc: 0.965434	valid_1's auc: 0.932716
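Printing the evaluation metric on every one of the 200 boosting rounds makes the log above hard to read. A minimal, self-contained sketch of how the output could be thinned with LightGBM's log_evaluation and early_stopping callbacks follows; it is not the notebook's original cell, and the data here is synthetic only so the snippet runs on its own:

import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Synthetic stand-in data, only to make the sketch runnable.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 10))
y = (X[:, 0] + rng.normal(size=1000) > 0).astype(int)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

clf = lgb.LGBMClassifier(n_estimators=200)
clf.fit(
    X_tr, y_tr,
    eval_set=[(X_tr, y_tr), (X_val, y_val)],
    eval_metric='auc',
    callbacks=[
        lgb.log_evaluation(period=50),           # print every 50 rounds, not every round
        lgb.early_stopping(stopping_rounds=50),  # stop once valid AUC stops improving
    ],
)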
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/ipykernel_launcher.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/sklearn/preprocessing/_label.py:98: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/sklearn/preprocessing/_label.py:133: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/lightgbm/sklearn.py:726: UserWarning: 'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. Pass 'early_stopping()' callback via 'callbacks' argument instead.
  _log_warning("'early_stopping_rounds' argument is deprecated and will be removed in a future release of LightGBM. "
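The warnings above each point at a small fix. The cell that triggered them is not shown here, so the frame and column names in the following sketch are illustrative stand-ins, not the notebook's actual variables:

import numpy as np
import pandas as pd

df = pd.DataFrame({'duration': [100, 200, 300], 'y': [0, 1, 0]})

# SettingWithCopyWarning: take an explicit copy of the slice, then assign via .loc.
sub = df[df['y'] == 0].copy()
sub.loc[:, 'duration_log'] = np.log1p(sub['duration'])

# DataConversionWarning: estimators expect y with shape (n_samples,), not (n_samples, 1).
y_2d = df[['y']].values   # shape (3, 1) -> triggers the warning
y_1d = y_2d.ravel()       # shape (3,)   -> what scikit-learn expects

# UserWarning: replace the deprecated early_stopping_rounds= argument with the
# lgb.early_stopping() callback, as in the fit() sketch above.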
[1]	training's auc: 0.783598	valid_1's auc: 0.776793
[2]	training's auc: 0.79352	valid_1's auc: 0.787587
...
[50]	training's auc: 0.947115	valid_1's auc: 0.939627
...
[100]	training's auc: 0.953849	valid_1's auc: 0.943029
...
[150]	training's auc: 0.959127	valid_1's auc: 0.944001
...
[200]	training's auc: 0.964199	valid_1's auc: 0.94431
[1]	training's auc: 0.784804	valid_1's auc: 0.767997
[2]	training's auc: 0.917981	valid_1's auc: 0.916209
...
[50]	training's auc: 0.947599	valid_1's auc: 0.937799
...
[100]	training's auc: 0.954096	valid_1's auc: 0.940296
...
[150]	training's auc: 0.959678	valid_1's auc: 0.941528
...
[200]	training's auc: 0.964176	valid_1's auc: 0.941824
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/ipykernel_launcher.py:19: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/data/jiangpz/miniconda3/envs/jpz_k2/lib/python3.7/site-packages/ipykernel_launcher.py:21: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead. See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
In [130]:
res.head()
Out[130]:
 | ID | pred |
---|---|---|
25317 | 25318 | 0.044366 |
25318 | 25319 | 0.008862 |
25319 | 25320 | 0.013512 |
25320 | 25321 | 0.639717 |
25321 | 25322 | 0.029758 |
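With ID and pred in place, the remaining step is writing the submission file in the format the evaluation expects. A one-line sketch; the output path below is an assumption, chosen only to mirror the input directory used when reading the data:

# Output path is assumed, by analogy with the '../doc/data/bank/input/' layout.
res.to_csv('../doc/data/bank/output/submission.csv', index=False)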