This demo is based on R 4.2.2.

1. Foreword

Quite a few readers have asked whether machine-learning classification can be done in R, because they would rather not learn Python.
My answer: of course it can. Just ask GPT or Kimi to convert the Python code into R.
Since I have not had much else to write about lately, here is the ported code.

2. Implementing XGBoost classification in R
(1) Import the data

I usually use RStudio's built-in data-import dialog.
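If you prefer to load the data in code rather than through the RStudio dialog, a minimal sketch is shown below. It assumes the dataset is a CSV file with a binary 0/1 outcome column named X; the file name is a placeholder.

# Minimal sketch: read the dataset from a CSV file (hypothetical file name)
data <- read.csv("my_data.csv")   # replace with your actual file path
str(data)                          # inspect the variable types
table(data$X)                      # X is assumed to be the binary outcome (0/1)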
(2) Build an XGBoost model (default parameters)

# Load necessary libraries
library(caret)
library(pROC)
library(ggplot2)
library(xgboost)

# Assume 'data' is your data frame containing the data
# Set seed to ensure reproducibility
set.seed(123)

# Split data into training and validation sets (80% training, 20% validation)
trainIndex <- createDataPartition(data$X, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Prepare matrices for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(trainData[, -which(names(trainData) == "X")]), label = trainData$X)
dvalid <- xgb.DMatrix(data = as.matrix(validData[, -which(names(validData) == "X")]), label = validData$X)

# Define parameters for XGBoost
params <- list(booster = "gbtree", objective = "binary:logistic", eta = 0.1, gamma = 0,
               max_depth = 6, min_child_weight = 1, subsample = 0.8, colsample_bytree = 0.8)

# Train the XGBoost model
model <- xgb.train(params = params, data = dtrain, nrounds = 100,
                   watchlist = list(eval = dtrain), verbose = 1)

# Predict on the training and validation sets
trainPredict <- predict(model, dtrain)
validPredict <- predict(model, dvalid)

# Convert predictions to binary using 0.5 as threshold
# trainPredict <- ifelse(trainPredict > 0.5, 1, 0)
# validPredict <- ifelse(validPredict > 0.5, 1, 0)

# Calculate ROC curves and AUC values
# trainRoc <- roc(response = trainData$X, predictor = as.numeric(trainPredict))
# validRoc <- roc(response = validData$X, predictor = as.numeric(validPredict))
trainRoc <- roc(response = as.numeric(trainData$X) - 1, predictor = trainPredict)
validRoc <- roc(response = as.numeric(validData$X) - 1, predictor = validPredict)

# Plot ROC curves with AUC values
ggplot(data = data.frame(fpr = trainRoc$specificities, tpr = trainRoc$sensitivities),
       aes(x = 1 - fpr, y = tpr)) +
  geom_line(color = "blue") +
  geom_area(alpha = 0.2, fill = "blue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  ggtitle("Training ROC Curve") +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  annotate("text", x = 0.5, y = 0.1,
           label = paste("Training AUC =", round(auc(trainRoc), 2)),
           hjust = 0.5, color = "blue")

ggplot(data = data.frame(fpr = validRoc$specificities, tpr = validRoc$sensitivities),
       aes(x = 1 - fpr, y = tpr)) +
  geom_line(color = "red") +
  geom_area(alpha = 0.2, fill = "red") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  ggtitle("Validation ROC Curve") +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  annotate("text", x = 0.5, y = 0.2,
           label = paste("Validation AUC =", round(auc(validRoc), 2)),
           hjust = 0.5, color = "red")

# Calculate confusion matrices based on 0.5 cutoff for probability
confMatTrain <- table(trainData$X, trainPredict > 0.5)
confMatValid <- table(validData$X, validPredict > 0.5)

# Function to plot confusion matrix using ggplot2
plot_confusion_matrix <- function(conf_mat, dataset_name) {
  conf_mat_df <- as.data.frame(as.table(conf_mat))
  colnames(conf_mat_df) <- c("Actual", "Predicted", "Freq")
  p <- ggplot(data = conf_mat_df, aes(x = Predicted, y = Actual, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = paste("Confusion Matrix -", dataset_name, "Set"),
         x = "Predicted Class", y = "Actual Class") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5))
  print(p)
}

# Now call the function to plot and display the confusion matrices
plot_confusion_matrix(confMatTrain, "Training")
plot_confusion_matrix(confMatValid, "Validation")

# Extract values for calculations
a_train <- confMatTrain[1, 1]
b_train <- confMatTrain[1, 2]
c_train <- confMatTrain[2, 1]
d_train <- confMatTrain[2, 2]

a_valid <- confMatValid[1, 1]
b_valid <- confMatValid[1, 2]
c_valid <- confMatValid[2, 1]
d_valid <- confMatValid[2, 2]

# Training Set Metrics
acc_train <- (a_train + d_train) / sum(confMatTrain)
error_rate_train <- 1 - acc_train
sen_train <- d_train / (d_train + c_train)
sep_train <- a_train / (a_train + b_train)
precision_train <- d_train / (b_train + d_train)
F1_train <- (2 * precision_train * sen_train) / (precision_train + sen_train)
MCC_train <- (d_train * a_train - b_train * c_train) /
  sqrt((d_train + b_train) * (d_train + c_train) * (a_train + b_train) * (a_train + c_train))
auc_train <- roc(response = trainData$X, predictor = trainPredict)$auc

# Validation Set Metrics
acc_valid <- (a_valid + d_valid) / sum(confMatValid)
error_rate_valid <- 1 - acc_valid
sen_valid <- d_valid / (d_valid + c_valid)
sep_valid <- a_valid / (a_valid + b_valid)
precision_valid <- d_valid / (b_valid + d_valid)
F1_valid <- (2 * precision_valid * sen_valid) / (precision_valid + sen_valid)
MCC_valid <- (d_valid * a_valid - b_valid * c_valid) /
  sqrt((d_valid + b_valid) * (d_valid + c_valid) * (a_valid + b_valid) * (a_valid + c_valid))
auc_valid <- roc(response = validData$X, predictor = validPredict)$auc

# Print Metrics
cat("Training Metrics\n")
cat("Accuracy:", acc_train, "\n")
cat("Error Rate:", error_rate_train, "\n")
cat("Sensitivity:", sen_train, "\n")
cat("Specificity:", sep_train, "\n")
cat("Precision:", precision_train, "\n")
cat("F1 Score:", F1_train, "\n")
cat("MCC:", MCC_train, "\n")
cat("AUC:", auc_train, "\n\n")

cat("Validation Metrics\n")
cat("Accuracy:", acc_valid, "\n")
cat("Error Rate:", error_rate_valid, "\n")
cat("Sensitivity:", sen_valid, "\n")
cat("Specificity:", sep_valid, "\n")
cat("Precision:", precision_valid, "\n")
cat("F1 Score:", F1_valid, "\n")
cat("MCC:", MCC_valid, "\n")
cat("AUC:", auc_valid, "\n")
There are many tunable parameters when training an XGBoost model in R (a minimal sketch of how they are assembled into a parameter list follows this overview):
(1) General parameters
These control the overall behaviour of XGBoost:
① booster: the type of model used at each boosting step. Common choices are:
gbtree (tree-based models, the default), gblinear (linear models), and dart (Dropouts meet Multiple Additive Regression Trees).
② nthread: the number of parallel threads; defaults to the maximum number available.
③ verbosity: how detailed the printed messages are: 0 (silent), 1 (warning), 2 (info), 3 (debug).
(2) Booster parameters
These control the behaviour of each boosting step (each booster):
① eta (or learning_rate): the learning rate; shrinks each step's contribution to prevent overfitting.
② min_child_weight: the minimum sum of instance weights required in a leaf node; used to control overfitting.
③ max_depth: the maximum depth of a tree; limits tree growth to avoid overfitting.
④ max_leaf_nodes: the maximum number of leaf nodes.
⑤ gamma (or min_split_loss): the minimum loss reduction required to split a node.
⑥ subsample: the fraction of the training data randomly sampled for each tree.
⑦ colsample_bytree / colsample_bylevel / colsample_bynode: the fraction of features sampled per tree / per level / per node when building a tree.
⑧ lambda (or reg_lambda): the L2 regularization weight.
⑨ alpha (or reg_alpha): the L1 regularization weight.
⑩ scale_pos_weight: the weight applied to the positive class when the classes are imbalanced.
⑪ n_estimators / nrounds: the number of trees in the boosting process, i.e. the number of boosting rounds; each round typically adds one new model (usually a tree).
(3) Learning task parameters
These control the learning task and the corresponding learning objective:
① objective: defines the learning task and the corresponding objective, for example:
- binary:logistic: logistic regression for binary classification; returns predicted probabilities.
- multi:softmax: softmax for multi-class classification; requires num_class (the number of classes) to be set.
- reg:squarederror: squared error for regression tasks.
② eval_metric: the evaluation metric(s) for the validation data, e.g. rmse, mae, logloss, error (classification error rate), auc.
③ seed: the random seed, for reproducibility.
(5) Parameters specific to the DART booster
These apply when booster is set to dart:
① sample_type: the sampling algorithm.
② normalize_type: the normalization algorithm.
③ rate_drop: the fraction of trees dropped in each iteration.
④ skip_drop: the probability of skipping the dropout procedure in an iteration.
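To make these options concrete, here is a minimal sketch of how general, booster, and learning-task parameters are combined into a single params list and passed to xgb.train(). The values are illustrative placeholders (not recommendations), and dtrain is assumed to be an xgb.DMatrix built as in the code above.

library(xgboost)

# Illustrative parameter list; values are placeholders, not tuned recommendations
params <- list(
  booster          = "gbtree",          # general parameter: tree booster
  nthread          = 2,                 # general parameter: parallel threads
  objective        = "binary:logistic", # learning-task parameter
  eval_metric      = "auc",             # learning-task parameter
  eta              = 0.1,               # booster parameter: learning rate
  max_depth        = 6,
  min_child_weight = 1,
  gamma            = 0,
  subsample        = 0.8,
  colsample_bytree = 0.8,
  lambda           = 1,                 # L2 regularization weight
  alpha            = 0,                 # L1 regularization weight
  scale_pos_weight = 1                  # increase for imbalanced classes
)

# 'dtrain' is assumed to be the xgb.DMatrix created earlier in this post
model <- xgb.train(params = params, data = dtrain, nrounds = 100, verbose = 1)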
After running the model with these more or less arbitrary parameter values, the results are as follows: judging by the AUC, an out-of-the-box XGBoost run overfits immediately, and validation performance falls well short of training performance. Proper tuning is required.

3. Principles for manually tuning XGBoost
The general tuning strategy is to first use grid search, random search, or a more advanced method such as Bayesian optimization to roughly identify suitable parameter ranges, and then tune and validate more finely within those ranges to find the best model configuration. (A cross-validation sketch for choosing the number of boosting rounds follows the list below.)
The main parameters to tune are max_depth, min_child_weight, gamma, subsample, colsample_bytree / colsample_bylevel / colsample_bynode, eta, lambda, alpha, and n_estimators (or nrounds).
max_depth (maximum depth): usually in the range 3 to 10. Larger depths can lead to overfitting, especially on small datasets.
min_child_weight (minimum child weight): helps control overfitting; with highly imbalanced classes this value can be increased.
gamma: start from 0 and increase gradually as needed to control overfitting.
subsample and colsample_bytree / colsample_bylevel / colsample_bynode (row subsampling rate and column sampling rate per tree / level / node): usually in the range 0.5 to 1. These control the data and feature sampling at each step.
eta (learning rate): smaller values make training more robust but require more boosting iterations.
lambda and alpha (L2 and L1 regularization): add regularization terms to the cost function; values in the range 0 to 10 usually work well.
nrounds (number of trees, i.e. number of iterations): more trees can model more complex patterns but may also lead to overfitting.
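Before running a full grid search, it can help to fix nrounds first. Below is a minimal sketch, under the same assumptions as the code above (dtrain already built, binary 0/1 outcome), that uses xgb.cv() with early stopping to pick a reasonable number of boosting rounds for one candidate parameter set; the specific values are placeholders.

# Minimal sketch: 5-fold cross-validation with early stopping to choose nrounds
# for one candidate parameter set (values are illustrative, not tuned)
cv_params <- list(booster = "gbtree", objective = "binary:logistic",
                  eval_metric = "auc", eta = 0.1, max_depth = 6,
                  min_child_weight = 1, subsample = 0.8, colsample_bytree = 0.8)

cv_result <- xgb.cv(params = cv_params, data = dtrain, nrounds = 500, nfold = 5,
                    early_stopping_rounds = 20, verbose = 0)

# Best iteration found by cross-validation; use it as nrounds for the final model
best_nrounds <- cv_result$best_iteration
print(best_nrounds)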
# Load necessary libraries
library(caret)
library(pROC)
library(ggplot2)
library(xgboost)

# Assume 'data' is your data frame containing the data
# Set seed to ensure reproducibility
set.seed(123)

# Convert the target variable to factor if not already
data$X <- factor(data$X)

# Split data into training and validation sets (80% training, 20% validation)
trainIndex <- createDataPartition(data$X, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
validData <- data[-trainIndex, ]

# Prepare matrices for XGBoost
dtrain <- xgb.DMatrix(data = as.matrix(trainData[, -which(names(trainData) == "X")]), label = as.numeric(trainData$X) - 1)
dvalid <- xgb.DMatrix(data = as.matrix(validData[, -which(names(validData) == "X")]), label = as.numeric(validData$X) - 1)

# Define parameter grid
depths <- c(4, 6, 10)
weights <- c(1, 5, 10)
gammas <- c(0, 0.2, 0.5)
subsamples <- c(0.5, 0.8, 0.9)
colsamples <- c(0.5, 0.8, 0.9)
etas <- c(0.01, 0.1, 0.2)
lambdas <- c(0, 5, 10)
alphas <- c(0, 1, 5)
nrounds <- c(100, 250, 500)

best_auc <- 0
best_params <- list()

# Loop through parameter grid
for (max_depth in depths) {
  for (min_child_weight in weights) {
    for (gamma in gammas) {
      for (subsample in subsamples) {
        for (colsample_bytree in colsamples) {
          for (eta in etas) {
            for (lambda in lambdas) {
              for (alpha in alphas) {
                for (nround in nrounds) {
                  # Set parameters for this iteration
                  params <- list(booster = "gbtree",
                                 objective = "binary:logistic",
                                 eta = eta,
                                 gamma = gamma,
                                 max_depth = max_depth,
                                 min_child_weight = min_child_weight,
                                 subsample = subsample,
                                 colsample_bytree = colsample_bytree,
                                 lambda = lambda,
                                 alpha = alpha)
                  # Train the model
                  model <- xgb.train(params = params, data = dtrain, nrounds = nround,
                                     watchlist = list(eval = dtrain), verbose = 0)
                  # Predict on the validation set
                  validPredict <- predict(model, dvalid)
                  validPredictBinary <- ifelse(validPredict > 0.5, 1, 0)
                  # Calculate AUC
                  validRoc <- roc(response = as.numeric(validData$X) - 1, predictor = validPredictBinary)
                  auc_score <- auc(validRoc)
                  # Update best model if current AUC is better
                  if (auc_score > best_auc) {
                    best_auc <- auc_score
                    best_params <- params
                    best_params$nrounds <- nround
                  }
                }
              }
            }
          }
        }
      }
    }
  }
}

# Print the best AUC and corresponding parameters
print(paste("Best AUC:", best_auc))
print("Best Parameters:")
print(best_params)

# After parameter tuning, train the model with best parameters
model <- xgb.train(params = best_params, data = dtrain, nrounds = best_params$nrounds,
                   watchlist = list(eval = dtrain), verbose = 0)

# Predict on the training and validation sets using the final model
trainPredict <- predict(model, dtrain)
validPredict <- predict(model, dvalid)

# Convert predictions to binary using 0.5 as threshold
# trainPredictBinary <- ifelse(trainPredict > 0.5, 1, 0)
# validPredictBinary <- ifelse(validPredict > 0.5, 1, 0)

# Calculate ROC curves and AUC values
# trainRoc <- roc(response = trainData$X, predictor = as.numeric(trainPredict))
# validRoc <- roc(response = validData$X, predictor = as.numeric(validPredict))
trainRoc <- roc(response = as.numeric(trainData$X) - 1, predictor = trainPredict)
validRoc <- roc(response = as.numeric(validData$X) - 1, predictor = validPredict)

# Plot ROC curves with AUC values
ggplot(data = data.frame(fpr = trainRoc$specificities, tpr = trainRoc$sensitivities),
       aes(x = 1 - fpr, y = tpr)) +
  geom_line(color = "blue") +
  geom_area(alpha = 0.2, fill = "blue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  ggtitle("Training ROC Curve") +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  annotate("text", x = 0.5, y = 0.1,
           label = paste("Training AUC =", round(auc(trainRoc), 2)),
           hjust = 0.5, color = "blue")

ggplot(data = data.frame(fpr = validRoc$specificities, tpr = validRoc$sensitivities),
       aes(x = 1 - fpr, y = tpr)) +
  geom_line(color = "red") +
  geom_area(alpha = 0.2, fill = "red") +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  ggtitle("Validation ROC Curve") +
  xlab("False Positive Rate") +
  ylab("True Positive Rate") +
  annotate("text", x = 0.5, y = 0.2,
           label = paste("Validation AUC =", round(auc(validRoc), 2)),
           hjust = 0.5, color = "red")

# Calculate confusion matrices based on 0.5 cutoff for probability
confMatTrain <- table(trainData$X, trainPredict > 0.5)
confMatValid <- table(validData$X, validPredict > 0.5)

# Function to plot confusion matrix using ggplot2
plot_confusion_matrix <- function(conf_mat, dataset_name) {
  conf_mat_df <- as.data.frame(as.table(conf_mat))
  colnames(conf_mat_df) <- c("Actual", "Predicted", "Freq")
  p <- ggplot(data = conf_mat_df, aes(x = Predicted, y = Actual, fill = Freq)) +
    geom_tile(color = "white") +
    geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(title = paste("Confusion Matrix -", dataset_name, "Set"),
         x = "Predicted Class", y = "Actual Class") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5))
  print(p)
}

# Now call the function to plot and display the confusion matrices
plot_confusion_matrix(confMatTrain, "Training")
plot_confusion_matrix(confMatValid, "Validation")

# Extract values for calculations
a_train <- confMatTrain[1, 1]
b_train <- confMatTrain[1, 2]
c_train <- confMatTrain[2, 1]
d_train <- confMatTrain[2, 2]

a_valid <- confMatValid[1, 1]
b_valid <- confMatValid[1, 2]
c_valid <- confMatValid[2, 1]
d_valid <- confMatValid[2, 2]

# Training Set Metrics
acc_train <- (a_train + d_train) / sum(confMatTrain)
error_rate_train <- 1 - acc_train
sen_train <- d_train / (d_train + c_train)
sep_train <- a_train / (a_train + b_train)
precision_train <- d_train / (b_train + d_train)
F1_train <- (2 * precision_train * sen_train) / (precision_train + sen_train)
MCC_train <- (d_train * a_train - b_train * c_train) /
  sqrt((d_train + b_train) * (d_train + c_train) * (a_train + b_train) * (a_train + c_train))
auc_train <- roc(response = trainData$X, predictor = trainPredict)$auc

# Validation Set Metrics
acc_valid <- (a_valid + d_valid) / sum(confMatValid)
error_rate_valid <- 1 - acc_valid
sen_valid <- d_valid / (d_valid + c_valid)
sep_valid <- a_valid / (a_valid + b_valid)
precision_valid <- d_valid / (b_valid + d_valid)
F1_valid <- (2 * precision_valid * sen_valid) / (precision_valid + sen_valid)
MCC_valid <- (d_valid * a_valid - b_valid * c_valid) /
  sqrt((d_valid + b_valid) * (d_valid + c_valid) * (a_valid + b_valid) * (a_valid + c_valid))
auc_valid <- roc(response = validData$X, predictor = validPredict)$auc

# Print Metrics
cat("Training Metrics\n")
cat("Accuracy:", acc_train, "\n")
cat("Error Rate:", error_rate_train, "\n")
cat("Sensitivity:", sen_train, "\n")
cat("Specificity:", sep_train, "\n")
cat("Precision:", precision_train, "\n")
cat("F1 Score:", F1_train, "\n")
cat("MCC:", MCC_train, "\n")
cat("AUC:", auc_train, "\n\n")

cat("Validation Metrics\n")
cat("Accuracy:", acc_valid, "\n")
cat("Error Rate:", error_rate_valid, "\n")
cat("Sensitivity:", sen_valid, "\n")
cat("Specificity:", sep_valid, "\n")
cat("Precision:", precision_valid, "\n")
cat("F1 Score:", F1_valid, "\n")
cat("MCC:", MCC_valid, "\n")
cat("AUC:", auc_valid, "\n")
Output: the parameters above are the relatively best combination found. Looking at the detailed performance, there seems to be some improvement: overfitting is less pronounced, and validation performance has also improved.
If you are interested, you can keep tuning.

5. Finally
As for the data:
Link: https://pan.baidu.com/s/1rEf6JZyzA1ia5exoq5OF7g?pwd=x8xm
Extraction code: x8xm