过采样和下采样调整不均衡样本的逻辑回归模型 目录 1 过采样 1.1 样本不均衡 1.2 概念 1.3 图片理解 1.4 SMOTE算法 1.5 算法导入 1.6 函数及格式 1.7 样本类别可视化理解 2 下采样 2.1 概念 2.2 图片理解 2.3 数据处理理解 2.4 样本类别可视化理解 3 实际调整模型 1 过采样 1.1 样本不均衡
数据集中不同类别的样本数量差异很大通常表现为一个类别的样本数量远多于其他类别。
1.2 概念
增加少数类的样本数量，使其与样本多的类别的样本数量相同。
1.3 图片理解 1.4 SMOTE算法 1.5 算法导入
from imblearn.over_sampling import SMOTE

1.6 函数及格式
ov = SMOTE(random_state=0)：随机抽取函数，random_state 是随机种子，保证取同一数字时随机抽取的数据相同。
x_ov, y_ov = ov.fit_resample(x_tr_all, y_tr_all)：x_ov、y_ov 是 x_tr_all、y_tr_all 经过随机抽取并自动拟合后的数据。
1.7 样本类别可视化理解
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import pylab as mpl

# --- Preprocessing: z-score standardize Amount, drop Time ---
scaler = StandardScaler()
data = pd.read_csv('creditcard.csv')
# Standardize Amount in place (double brackets keep it 2-D for the scaler)
data['Amount'] = scaler.fit_transform(data[['Amount']])
# Time carries no predictive value here, drop it
data = data.drop(['Time'], axis=1)
# Features: every column except the Class label
x_all = data.drop(['Class'], axis=1)
# Class is the target label (0 = normal, 1 = fraud per dataset convention — confirm)
y_all = data.Class
# Hold out 20% as a test set; fixed random_state for reproducibility
x_tr_all, x_te_all, y_tr_all, y_te_all = \
    train_test_split(x_all, y_all, test_size=0.2, random_state=1000)

# --- Visualize the imbalanced class distribution ---
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # allow CJK glyphs in plots
mpl.rcParams['axes.unicode_minus'] = False
labels_count = y_all.value_counts()  # Series.value_counts: pd.value_counts is deprecated
plt.title('正负样本数1')
plt.xlabel('类别')
plt.ylabel('频数')
labels_count.plot(kind='bar')
plt.show()

# --- Oversample the training set with SMOTE to balance the classes ---
from imblearn.over_sampling import SMOTE
ov = SMOTE(random_state=0)
x_tr_ov, y_tr_ov = ov.fit_resample(x_tr_all, y_tr_all)

# Candidate regularization strengths for later cross-validation
scores = []
c_range = [0.01, 0.1, 1, 10, 100]

# --- Visualize the balanced class distribution ---
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
mpl.rcParams['axes.unicode_minus'] = False
labels_count = y_tr_ov.value_counts()
plt.title('正负样本数')
plt.xlabel('类别')
plt.ylabel('频数')
labels_count.plot(kind='bar')
plt.show()

2 下采样
2.1 概念
减少多数类的样本数量，使其与样本少的类别的样本数量相同，但可能会丢失重要信息。
2.2 图片理解 2.3 数据处理理解
pt_eg = data_tr[data_tr['Class'] == 0] 与 ng_eg = data_tr[data_tr['Class'] == 1]：找出两类数据；pt_eg = pt_eg.sample(len(ng_eg))：按少数类的数量对多数类数据进行随机抽取；data_c = pd.concat([pt_eg, ng_eg])：再将两类数据合并。
2.4 样本类别可视化理解
代码展示
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from numpy.random import sample
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pylab as mpl

# --- Preprocessing: z-score standardize Amount, drop Time ---
scaler = StandardScaler()
data = pd.read_csv('creditcard.csv')
# Standardize Amount in place (double brackets keep it 2-D for the scaler)
data['Amount'] = scaler.fit_transform(data[['Amount']])
# Time carries no predictive value here, drop it
data = data.drop(['Time'], axis=1)
# Features: every column except the Class label
x_all = data.drop(['Class'], axis=1)
y_all = data.Class
# Hold out 20% as a test set; fixed random_state for reproducibility
x_tr_all, x_te_all, y_tr_all, y_te_all = \
    train_test_split(x_all, y_all, test_size=0.2, random_state=1000)

# --- Visualize the imbalanced class distribution ---
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # allow CJK glyphs in plots
mpl.rcParams['axes.unicode_minus'] = False
labels_count = y_all.value_counts()  # Series.value_counts: pd.value_counts is deprecated
plt.title('正负样本数1')
plt.xlabel('类别')
plt.ylabel('频数')
labels_count.plot(kind='bar')
plt.show()

# --- Undersampling ---
# Reattach the label so rows can be filtered by class; note this mutates x_tr_all
np.random.seed(seed=4)  # make DataFrame.sample below reproducible
x_tr_all['Class'] = y_tr_all
data_tr = x_tr_all
pt_eg = data_tr[data_tr['Class'] == 0]  # majority (negative) class
ng_eg = data_tr[data_tr['Class'] == 1]  # minority (positive) class
# Downsample the majority class to the minority class size
pt_eg = pt_eg.sample(len(ng_eg))
data_c = pd.concat([pt_eg, ng_eg])
x_data_c = data_c.drop(['Class'], axis=1)
y_data_c = data_c['Class']

# --- Visualize the balanced class distribution ---
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
mpl.rcParams['axes.unicode_minus'] = False
labels_count = y_data_c.value_counts()
plt.title('正负样本数1')
plt.xlabel('类别')
plt.ylabel('频数')
labels_count.plot(kind='bar')
plt.show()

3 实际调整模型
不均衡样本下采样样本过采样样本训练模型代码及结果可以明显看到数据召回率上升。
代码展示
import time
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from numpy.random import sample
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import pylab as mpl

# --- Preprocessing: z-score standardize Amount, drop Time ---
scaler = StandardScaler()
data = pd.read_csv('creditcard.csv')
data['Amount'] = scaler.fit_transform(data[['Amount']])
data = data.drop(['Time'], axis=1)
x_all = data.drop(['Class'], axis=1)
y_all = data.Class
x_tr_all, x_te_all, y_tr_all, y_te_all = \
    train_test_split(x_all, y_all, test_size=0.2, random_state=1000)

# =============================================================
# 1) Baseline: train on the raw, imbalanced training set
# =============================================================
scores = []
c_range = [0.01, 0.1, 1, 10, 100]
# Grid-search C by 5-fold cross-validated recall on the training set
for i in c_range:
    start_time = time.time()
    lg = LogisticRegression(C=i, penalty='l2', solver='lbfgs', max_iter=1000)
    score = cross_val_score(lg, x_tr_all, y_tr_all, cv=5, scoring='recall')
    scores.append(sum(score) / len(score))  # mean recall over folds
    end_time = time.time()
best_c = c_range[np.argmax(scores)]
lg = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
# BUG FIX: original fitted on the test set (lg.fit(x_te_all, y_te_all)),
# which leaks test data and makes the report meaningless. Fit on training data.
lg.fit(x_tr_all, y_tr_all)
te_pr = lg.predict(x_te_all)
print('不均衡样本训练')
print(metrics.classification_report(y_te_all, te_pr))

# =============================================================
# 2) Undersampling: balance by downsampling the majority class
# =============================================================
np.random.seed(seed=4)  # make DataFrame.sample reproducible
# Reattach the label so rows can be filtered by class; note this mutates x_tr_all
x_tr_all['Class'] = y_tr_all
data_tr = x_tr_all
pt_eg = data_tr[data_tr['Class'] == 0]  # majority class
ng_eg = data_tr[data_tr['Class'] == 1]  # minority class
pt_eg = pt_eg.sample(len(ng_eg))  # downsample majority to minority size
data_c = pd.concat([pt_eg, ng_eg])
x_data_c = data_c.drop(['Class'], axis=1)
y_data_c = data_c.Class
scores = []
c_range = [0.01, 0.1, 1, 10, 100]
for i in c_range:
    lg = LogisticRegression(C=i, penalty='l2', solver='lbfgs', max_iter=1000)
    score = cross_val_score(lg, x_data_c, y_data_c, cv=5, scoring='recall')
    scores.append(sum(score) / len(score))
best_c = c_range[np.argmax(scores)]
# Fit the best model on the balanced data; evaluate on the untouched test set
lg = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
lg.fit(x_data_c, y_data_c)
te_pr = lg.predict(x_te_all)
print('下采样均衡样本训练')
print(metrics.classification_report(y_te_all, te_pr))

# =============================================================
# 3) Oversampling: balance with SMOTE synthetic minority samples
# =============================================================
# Reload and re-split: the undersampling step above mutated x_tr_all
scaler = StandardScaler()
data = pd.read_csv('creditcard.csv')
data['Amount'] = scaler.fit_transform(data[['Amount']])
data = data.drop(['Time'], axis=1)
x_all = data.drop(['Class'], axis=1)
y_all = data.Class
x_tr_all, x_te_all, y_tr_all, y_te_all = \
    train_test_split(x_all, y_all, test_size=0.2, random_state=1000)
from imblearn.over_sampling import SMOTE
ov = SMOTE(random_state=0)
# Oversample ONLY the training split so the test set stays untouched
x_tr_ov, y_tr_ov = ov.fit_resample(x_tr_all, y_tr_all)
scores = []
c_range = [0.01, 0.1, 1, 10, 100]
for i in c_range:
    lg = LogisticRegression(C=i, penalty='l2', solver='lbfgs', max_iter=1000)
    score = cross_val_score(lg, x_tr_ov, y_tr_ov, cv=5, scoring='recall')
    scores.append(sum(score) / len(score))
best_c = c_range[np.argmax(scores)]
lg = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
lg.fit(x_tr_ov, y_tr_ov)
te_pr1 = lg.predict(x_te_all)
print('过采样均衡样本训练')
print(metrics.classification_report(y_te_all, te_pr1))

运行结果