知识要点

estimator 的用法我还没有完全理解透。数据集是泰坦尼克号乘客的幸存数据。

- 读取数据：train_df = pd.read_csv('./data/titanic/train.csv')
- 显示数据特征：train_df.info()
- 显示开头部分数据：train_df.head()
- 提取目标特征：y_train = train_df.pop('survived')
- 显示数据分布：train_df.describe()
- 柱状图显示：train_df.age.hist(bins=20)
- 横向柱状图：train_df.sex.value_counts().plot(kind='barh')
- 按性别分组查看幸存率的均值：pd.concat([train_df, y_train], axis=1).groupby('sex').survived.mean().plot(kind='barh')
- 统计某个特征各取值的数量：train_df.embark_town.value_counts()
- 提取离散特征的取值：vocab = train_df[categorical_column].unique()
- one-hot 编码：tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(categorical_column, vocab))
- dataset 批次设置：dataset = dataset.repeat(epochs).batch(batch_size)

1 导包
from tensorflow import keras
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
2 数据导入
# Load the Titanic training and evaluation splits from CSV.
train_df = pd.read_csv("./data/titanic/train.csv")
eval_df = pd.read_csv("./data/titanic/eval.csv")  # held-out data used for evaluation

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
train_df.info()
eval_df.info()

# Show the first rows; print() makes the result visible outside a notebook.
print(train_df.head())

# --- 3. Extract the target values ---
# Split the label column off both frames. DataFrame.pop removes "survived"
# in place, so train_df / eval_df contain only feature columns afterwards.
y_train = train_df.pop("survived")
y_eval = eval_df.pop("survived")

print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())

# --- 4. Feature processing ---
# Summary statistics of the numeric columns.
print(train_df.describe())

# Each chart is followed by plt.show(): pandas Series plots reuse the current
# matplotlib axes, so without it consecutive plots run from a plain script
# would be drawn on top of one another.

# Age distribution.
train_df.age.hist(bins=20)
plt.show()

# Passenger count per sex.
train_df.sex.value_counts().plot(kind="barh")
plt.show()

# Passenger count per cabin class.
train_df["class"].value_counts().plot(kind="barh")
plt.show()

# Passenger count per port of embarkation.
train_df["embark_town"].value_counts().plot(kind="barh")
plt.show()

# Mean survival rate per sex: re-attach the labels, group by sex, average.
pd.concat([train_df, y_train], axis=1).groupby("sex").survived.mean().plot(kind="barh")
plt.show()

# Number of passengers per embarkation town.
print(train_df.embark_town.value_counts())
# Output recorded in the original write-up:
#   Southampton    450
#   Cherbourg      123
#   Queenstown      53
#   unknown          1
#   Name: embark_town, dtype: int64
# Separate the inputs into categorical (discrete) and numeric (continuous) features.
categorical_columns = [
    "sex",
    "n_siblings_spouses",
    "parch",
    "class",
    "deck",
    "embark_town",
    "alone",
]
numeric_columns = ["age", "fare"]

# Collects the feature-column definitions that are handed to the estimators.
feature_columns = []

for categorical_column in categorical_columns:
    # Distinct values of this column observed in the training data.
    vocab = train_df[categorical_column].unique()
    print(vocab)
    # Build a vocabulary-list categorical column and wrap it in an indicator
    # column so the feature reaches the model one-hot encoded.
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab
            )
        )
    )

# Numeric features are declared as float32 numeric columns.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numeric_column, dtype=tf.float32)
    )
5 dataset
# Helper that builds the tf.data.Dataset consumed by the estimators.
def make_dataset(data_df, label_df, epochs=10, shuffle=True, batch_size=32):
    """Build a batched dataset of (features, label) pairs.

    Args:
        data_df: Feature table. It is converted with ``dict()`` so that each
            column becomes one named entry of the feature dictionary.
        label_df: Labels, sliced in step with the rows of ``data_df``.
        epochs: Number of times the data is repeated.
        shuffle: Whether to shuffle the examples before repeating.
        batch_size: Number of examples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding batches of (feature dict, label).
    """
    dataset = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)  # shuffle buffer size
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset
# Example dataset built with a small batch size of 5.
train_dataset = make_dataset(train_df, y_train, batch_size=5)
# baseline_model
import os

output_dir = "baseline_model"
# makedirs(exist_ok=True) creates the directory only when it is missing,
# replacing the separate exists-then-mkdir check.
os.makedirs(output_dir, exist_ok=True)
baseline_estimator = tf.compat.v1.estimator.BaselineClassifier(
    model_dir=output_dir, n_classes=2
)

# input_fn must take no arguments and return either an (x, y) tuple or a
# dataset that yields (x, y); the lambda adapts make_dataset to that contract.
baseline_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100)
)

# Per the TensorFlow docs, BaselineClassifier ignores the feature values and
# only learns the label distribution, so this score is a lower bound to beat.
baseline_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False, batch_size=20
    )
)
# linear_model
linear_output_dir = "linear_model"
# Create the model directory only if it does not exist yet.
os.makedirs(linear_output_dir, exist_ok=True)
linear_estimator = tf.estimator.LinearClassifier(
    feature_columns=feature_columns, model_dir=linear_output_dir
)

# Train for 100 passes over the training data (default shuffle and batch size).
linear_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100)
)

# Evaluate with a single, unshuffled pass over the evaluation data.
linear_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False, batch_size=20
    )
)
# dnn_model
dnn_output_dir = "./dnn_model"
# Create the model directory only if it does not exist yet.
os.makedirs(dnn_output_dir, exist_ok=True)
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir=dnn_output_dir,  # storage location for the model
    n_classes=2,  # binary classification
    feature_columns=feature_columns,
    hidden_units=[128, 128],  # sizes of the hidden layers
    activation_fn=tf.nn.relu,  # activation function
    optimizer="Adam",  # optimizer, selected by name
)

# Train for 100 passes over the training data (default shuffle and batch size).
dnn_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100)
)

# Evaluate with a single, unshuffled pass over the evaluation data.
dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df, y_eval, epochs=1, shuffle=False, batch_size=20
    )
)