import tensorflow as tf
from tensorflow.python.feature_column import feature_column_v2 as fc_v2
from tensorflow.python.feature_column import feature_column as fc
# 注意:只有方式2会检查输入数据是否符合feature_column的定义
def numeric_column():
    """Demonstrate tf.feature_column.numeric_column via three evaluation paths.

    Note: only approach 2 (input_layer) validates the input data against the
    feature_column definition.
    """
    col = tf.feature_column.numeric_column(
        key="feature",
        shape=(3, 2, 1),
        default_value=100,
        dtype=tf.float32,
        normalizer_fn=lambda x: x / 2)
    features = {
        "feature": tf.constant(value=[
            [[1, 2], [3, 4], [5, 6]],
            [[7, 8], [9, 10], [11, 12]],
        ])
    }
    # Approach 1: v2 FeatureTransformationCache + get_dense_tensor.
    cache = fc_v2.FeatureTransformationCache(features=features)
    rs_1 = col.get_dense_tensor(transformation_cache=cache, state_manager=None)
    # Approach 2: input_layer — the only path that checks the input data.
    net = tf.feature_column.input_layer(features, col)
    # Approach 3: legacy (v1) _LazyBuilder + private _get_dense_tensor.
    lazy = fc._LazyBuilder(features)
    rs_3 = col._get_dense_tensor(lazy, None)
    with tf.Session() as sess:
        print(sess.run(rs_1))
        print(sess.run(net))
        print(sess.run(rs_3))
numeric_column()
import tensorflow as tf
from tensorflow.python.feature_column import feature_column_v2 as fc_v2
from tensorflow.python.feature_column import feature_column as fc
from tensorflow.python.feature_column import feature_column_lib as fcl
from tensorflow.python.feature_column import sequence_feature_column as sqfc
def sequence_numeric_column():
    """Demonstrate tf.feature_column.sequence_numeric_column via three
    evaluation paths.

    Usage largely mirrors numeric_column, but the input feature must be a
    SparseTensor.
    """
    column = tf.feature_column.sequence_numeric_column(
        key="feature",
        # `shape` is the shape of each element within the sequence.
        # The dense result is [batch_size, num_steps, *shape]; per the original
        # note num_steps is presumably element_count / prod(shape) — TODO
        # confirm against the TF docs.
        # `shape` only affects dense_tensor; sequence_length depends solely on
        # the actual input data.
        shape=(3,),
        default_value=60,
        dtype=tf.float32,
        normalizer_fn=lambda x: x / 2)
    # contrib variant — required by tf.contrib.feature_column.sequence_input_layer.
    column2 = tf.contrib.feature_column.sequence_numeric_column(
        key="feature",
        shape=(3,),
        default_value=60,
        dtype=tf.float32,
        normalizer_fn=lambda x: x / 2)
    features = {
        # The value for a sequence column must be a SparseTensor.
        "feature": tf.SparseTensor(
            # indices must be listed in sorted (row-major) order.
            indices=[
                [0, 0, 1],
                [0, 1, 0],
                [0, 5, 0],
                [0, 5, 1],
                [1, 2, 1],
                [1, 3, 0],
                [1, 3, 1]
            ],
            values=[4, 1, 7, 9, 3, 4., 4],
            dense_shape=[2, 6, 2])
    }
    # Approach 1: v2 FeatureTransformationCache + get_sequence_dense_tensor.
    # Bug fix: the original referenced the undefined name `feature_column_lib`
    # (NameError at runtime); the module is imported above as `fcl`.
    feature_cache = fcl.FeatureTransformationCache(features=features)
    rs_1 = column.get_sequence_dense_tensor(
        transformation_cache=feature_cache, state_manager=None)
    # Approach 2: contrib sequence_input_layer (takes the contrib column).
    rs_2 = tf.contrib.feature_column.sequence_input_layer(features, column2)
    # Approach 3: legacy _LazyBuilder + private _get_sequence_dense_tensor.
    builder = fc._LazyBuilder(features)
    rs_3 = column2._get_sequence_dense_tensor(builder, None)
    with tf.Session() as sess:
        print(sess.run(rs_1))
        print("111" * 20)
        print(sess.run(rs_2))
        print("222" * 20)
        print(sess.run(rs_3))
sequence_numeric_column()
input_layer的输入要求:
All items should be instances of classes derived from
`_DenseColumn` such as `numeric_column`, `embedding_column`,
`bucketized_column`, `indicator_column`. If you have categorical features,
you can wrap them with an `embedding_column` or `indicator_column`.
简单说就是input_layer的输入要求是dense稠密数据
tf.feature_column是一套处理数据的工具,我一般把它用做TensorFlow内的“特征工程”。
非sequence(非序列)固定长度的连续实数特征
sequence(序列)固定长度的连续实数特征(缺失值会用设置的默认值替代)
对于int型、string型分类特征:可以做成数值表示的特征、multi_hot特征、加权multi_hot特征、embedding特征、加权后的embedding特征。(适用于sequence和非sequence)
对于int型分类特征还可以做成one_hot特征。仅适用于非sequence。
还可以做int型、string型分类特征之间的交叉特征,以及share embedding特征
“fea_1”:0.123
“fea_2”:[0.123,0.222]
"fea_3":[1,3,5]
"fea_4":10
"fea_0":[ [[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]] ]
"fea_sparse_1" : tf.SparseTensor( # indices要按顺序写 indices=[ [0, 0, 1], [0, 1, 0], [0, 5, 0], [0, 5, 1], [1, 2, 1], [1, 3, 0], [1, 3, 1] ], values=[4, 1, 7, 9, 3, 4., 4], dense_shape=[2, 6, 2])
normalizer_fn=lambda x: x/2
# numeric_column只支持int和float类型
tf.feature_column.numeric_column(key="fea_1",shape=(1,),default_value=0,dtype=tf.float32,normalizer_fn=lambda x: ...)
tf.feature_column.numeric_column(key="fea_2",shape=(2,),default_value=0,dtype=tf.float32,normalizer_fn=lambda x: ...)
tf.feature_column.numeric_column(key="fea_3",shape=(3,),default_value=0,dtype=tf.int64,normalizer_fn=lambda x: ...)
对于fea_1、fea_2、fea_3、fea_4特征,可以将其放在一起作为"fea_num",这样生成的tfrecord包含的key将会减少,从而占用空间会减少。
column = tf.feature_column.sequence_numeric_column(
key="feature",
shape=(6,),
default_value=60,
dtype=tf.float32,
normalizer_fn=lambda x: x / 2)
# 适用于 tf.contrib.feature_column.sequence_input_layer
column2 = tf.contrib.feature_column.sequence_numeric_column(
key="feature",
shape=(6,),
default_value=60,
dtype=tf.float32,
normalizer_fn=lambda x: x / 2)
# 输入稀疏特征"fea_sparse_1"
# 结果:
TensorSequenceLengthPair(dense_tensor=array([[[60. , 2. , 0.5, 60. , 60. , 60. ],
[60. , 60. , 60. , 60. , 3.5, 4.5]],
[[60. , 60. , 60. , 60. , 60. , 1.5],
[ 2. , 2. , 60. , 60. , 60. , 60. ]]], dtype=float32), sequence_length=array([6, 4], dtype=int64))
# 3个2行2列的数据
"fea_5":[
[["value1", "value2"], ["value3", "value3"]],
[["value3", "value5"], ["value4", "value4"]],
[["value4", "value5"], ["value2", "value4"]]
]
# 下面两个都是2个1维数据
"fea_6":["value1","value2"]
"fea_7":[["value1"],["value2"]]
# 1个1维数据
"fea_8":["value1"]
# 2个1行两列的数据
"fea_9":[["value1","value3"],["value2","value4"]]
# 2个2行2列的数据
"fea_10":[
[["value1", "value2"], ["value3", "value3"]],
[["value3", "value5"], ["value4", "value4"]]
]
# 3个2行3列的稠密数据
"fea_11":[
[[1, 2, 3], [4, 5, 6]],
[[5, 6, 7], [8, 9, 10]],
[[8, 9, 10], [11, 12, 13]]
]
# 3个1行6列的稠密数据
"fea_12":[
[1, 2, 3, 4, 5, 6],
[5, 6, 7, 8, 9, 10],
[8, 9, 10, 11, 12, 13]
]
# 分类特征取值对应权重数据(对应的数据最好和权重的维度保持一致,且最多是2个维度)
"fea_weight_1":[
[1.1, 2.2, 3.3, 4.4, 5.5, 6.6],
[9.9, 8.8, 7.7, 6.6, 5.5, 4.4]
]
# 权重数据3个1行4列数据
"fea_weight_2":[
[1.1, 2.2, 3.3, 4.4],
[9.9, 8.8, 7.7, 6.6],
[3.4, 8.8, 2.2, 6.6]
]
# 3个1行4列特征分类特征数据
"fea_13":[
["value1", "value2","value3", "value3"],
["value3", "value5","value4", "value4"],
["value4", "value5","value2", "value4"]
]
# 2个3行2列的特征数据
"fea_14":[
[[1, 2], [3, 4], [5, 6]],
[[7, 7], [9, 10], [11, 12]]
]
int型数据同上
使用的四种方式:
注意:对于稠密dense tensor特征,其输入数据维度必须保持一致
# 类别取值为int类型
column = tf.feature_column.categorical_column_with_vocabulary_list(
key="feature",
vocabulary_list=[1, 2, 3, 4],
dtype=tf.int64,
default_value=-1,
# 作用同default_value,但是两者不能同时起作用。
# 将超出的取值映射到[len(vocabulary), len(vocabulary) + num_oov_buckets)内
# 默认取值为0
# 当该值不为0时,default_value必须设置为-1
# 当default_value和num_oov_buckets都取默认值时,会被映射为-1
num_oov_buckets=4)
# 类别取值为string类型
column = tf.feature_column.categorical_column_with_vocabulary_list(
key="feature",
vocabulary_list=["value1", "value2", "value3","value4"],
dtype=tf.string,
default_value=-1,
num_oov_buckets=4)
# 输入数据fea_5后转换的稀疏tensor张量结果:
SparseTensorValue(indices=array([[0, 0, 0],
[0, 0, 1],
[0, 1, 0],
[0, 1, 1],
[1, 0, 0],
[1, 0, 1],
[1, 1, 0],
[1, 1, 1],
[2, 0, 0],
[2, 0, 1],
[2, 1, 0],
[2, 1, 1]], dtype=int64), values=array([0, 1, 2, 2, 2, 6, 3, 3, 3, 6, 1, 3], dtype=int64), dense_shape=array([3, 2, 2], dtype=int64))
# 使用方式:1
# 转换为数值表示的稠密dense tensor结果:
[[[0 1]
[2 2]]
[[2 6]
[3 3]]
[[3 6]
[1 3]]]
# 使用方式:2
# 转换为multi_hot特征的结果(8列=vocabulary_list长度+num_oov_buckets):
[[[1. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 2. 0. 0. 0. 0. 0.]]
[[0. 0. 1. 0. 0. 0. 1. 0.]
[0. 0. 0. 2. 0. 0. 0. 0.]]
[[0. 0. 0. 1. 0. 0. 1. 0.]
[0. 1. 0. 1. 0. 0. 0. 0.]]]
# 使用方式:3
# 转换为embedding特征的结果(3列是自己设置的embedding的维度):
[[[-0.36440656 0.1924808 0.1217252 ] # 表征数据整体 ["value1", "value2"]
[ 0.71263236 -0.45157978 -0.3456324 ]] # 表征数据整体 ["value3", "value3"]
[[-0.18493024 -0.20456922 -0.3947454 ] # 表征数据整体 ["value3", "value5"]
[-0.19874108 0.6833139 -0.56441975]] # 表征数据整体 ["value4", "value4"]
[[-0.64061695 0.3628776 -0.50413907] # 表征数据整体 ["value4", "value5"]
[-0.28863966 0.14901578 0.16483489]]] # 表征数据整体 ["value2", "value4"]
# 注意:
# 使用input_layer时"fea_5"不可用而"fea_9"可用,好像是不支持维度太高的
# 输入数据:"fea_weight_2" 和 "fea_13" 后的加权特征结果(方式4、5):
# 使用方式:4
# 稠密的dense tensor数值特征没用,有用的是embedding特征及加权的multi_hot特征:
IdWeightPair(id_tensor=SparseTensorValue(indices=array([[0, 0],
[0, 1],
[0, 2],
[0, 3],
[1, 0],
[1, 1],
[1, 2],
[1, 3],
[2, 0],
[2, 1],
[2, 2],
[2, 3]], dtype=int64), values=array([0, 1, 2, 2, 2, 6, 3, 3, 3, 6, 1, 3], dtype=int64), dense_shape=array([3, 4], dtype=int64)), weight_tensor=SparseTensorValue(indices=array([[0, 0],
[0, 1],
[0, 2],
[0, 3],
[1, 0],
[1, 1],
[1, 2],
[1, 3],
[2, 0],
[2, 1],
[2, 2],
[2, 3]], dtype=int64), values=array([1.1, 2.2, 3.3, 4.4, 9.9, 8.8, 7.7, 6.6, 3.4, 8.8, 2.2, 6.6],
dtype=float32), dense_shape=array([3, 4], dtype=int64)))
[[ 1.1 2.2 7.7 0. 0. 0. 0.
0. ]
[ 0. 0. 9.9 14.299999 0. 0. 8.8
0. ]
[ 0. 2.2 0. 10. 0. 0. 8.8
0. ]]
# 使用方式:5
# 加权的embedding
[[ 0.16342753 -0.07898534 -0.33816564 0.2438156 ]
[ 0.04507026 0.30109608 0.08584949 0.28742552]
[ 0.00048126 0.315775 0.1192891 0.21302155]]
column = tf.feature_column.sequence_categorical_column_with_vocabulary_list(
key="feature",
vocabulary_list=["value1", "value2", "value3"],
dtype=tf.string,
default_value=-1,
num_oov_buckets=2)
# 输入序列特征"fea_10"后的稀疏tensor结果:
SparseTensorValue(indices=array([[0, 0, 0],
[0, 0, 1],
[0, 1, 0],
[0, 1, 1],
[1, 0, 0],
[1, 0, 1],
[1, 1, 0],
[1, 1, 1]], dtype=int64), values=array([0, 1, 2, 2, 2, 3, 3, 3], dtype=int64), dense_shape=array([2, 2, 2], dtype=int64))
# 使用方式:1
# 转换为整数序列表示
[[[0 1]
[2 2]]
[[2 3]
[3 3]]]
# 使用方式:2
# 转换为multi_hot表示(维度为5=vocabulary_list+num_oov_buckets)
(array([[[1., 1., 0., 0., 0.],
[0., 0., 2., 0., 0.]],
[[0., 0., 1., 1., 0.],
[0., 0., 0., 2., 0.]]], dtype=float32), array([2, 2], dtype=int64))
# 使用方式:3
# 转换为embedding表示(维度为自己设置:3)
(array([[[ 0.54921925, 0.039222 , -0.20265868], # 表示 ["value1", "value2"]
[ 0.3889632 , 0.43282962, -0.2105029 ]], # 表示 ["value3", "value3"]
[[ 0.20231032, -0.11117572, -0.14481466], # 表示 ["value3", "value5"]
[ 0.01565746, -0.65518105, -0.07912641]] # 表示 ["value4", "value4"]
], dtype=float32), array([2, 2], dtype=int64))
# 注意使用input_layer时使用的是下面这个api:
# tf.contrib.feature_column.sequence_input_layer
# 这个api可以处理非sequence(非序列特征)的input_layer不能处理的"fea_5"特征
使用的三种方式:
同4.2,若输入是稠密dense tensor,其输入特征维度必须一致:
同4.2、categorical_column_with_vocabulary_list
column = tf.feature_column.categorical_column_with_vocabulary_file(
key="feature",
vocabulary_file="valuelist",
dtype=tf.string,
default_value=None,
num_oov_buckets=3)
# 注意:
# 在使用input_layer时同样不能处理"fea_5"这样的多维特征数据
#
# 文件valuelist的内容如下:
value1
value2
value3
同4.2、categorical_column_with_vocabulary_list
column = tf.feature_column.sequence_categorical_column_with_vocabulary_file(
key="feature",
vocabulary_file="valuelist",
dtype=tf.string,
default_value=None,
num_oov_buckets=3)
# 结果及注意事项同4.2的序列sequence一样
使用的四种方式:
同4.2,若输入是稠密dense tensor(int型),其输入特征维度必须保持一致
column = tf.feature_column.categorical_column_with_identity(
key='feature',
# 取值范围为[0, num_buckets)
num_buckets=10,
# 数据不在[0, num_buckets)内时,将被映射的值。
# 默认为None,这种情况下,当存在未知数据,会报错。
# 要求default_value的取值在[0, num_buckets)内
default_value=3)
# 结果及注意内容同4.2完全一样
column = tf.feature_column.sequence_categorical_column_with_identity(
key='feature',
num_buckets=10,
default_value=3)
# 结果及注意同4.2完全一样
使用的三种方式:
同4.2、若输入是稠密dense tensor数据,其输入特征维度必须保持一致
# string类型
column = tf.feature_column.categorical_column_with_hash_bucket(
key="feature",
# hash的空间大小
hash_bucket_size=10,
# 只支持string和integer
# 数值类型也是进行hash映射
dtype=tf.string)
# int类型
column = tf.feature_column.categorical_column_with_hash_bucket(
key="feature",
hash_bucket_size=10,
dtype=tf.int64)
# 结果和注意内容同4.2完全一样
# 处理string类型
column = tf.feature_column.sequence_categorical_column_with_hash_bucket(
key="feature",
hash_bucket_size=10,
dtype=tf.string)
# 处理int类型
column = tf.feature_column.sequence_categorical_column_with_hash_bucket(
key="feature",
hash_bucket_size=10,
dtype=tf.int64)
# 结果和注意 同4.2完全一样
使用的四种方式:
同4.2、若输入是稠密dense tensor数据,其输入特征维度必须保持一致
# keys为原始输入特征数据时:
column = tf.feature_column.crossed_column(
# keys的类型还可以为CategoricalColumn(hash类型的category不行)
keys=["fea_9", "fea_12"],
hash_bucket_size=100,
hash_key=None)
# keys为非hash类型的category类别特征时:
column_voc = tf.feature_column.categorical_column_with_vocabulary_file(
key="fea_9",
vocabulary_file="valuelist",
dtype=tf.string,
default_value=None,
num_oov_buckets=3)
column_iden = tf.feature_column.categorical_column_with_identity(
key='fea_12',
num_buckets=10,
default_value=3)
column_cro = tf.feature_column.crossed_column(
keys=[column_voc,column_iden],
hash_bucket_size=10,
hash_key=None)
# 结果和注意同4.2的非序列(非sequence)完全一样
使用的方式:
输入为稠密特征dense tensor
numeric_column = tf.feature_column.numeric_column(
key="feature",
shape=6,
default_value=0,
dtype=tf.float32)
column = tf.feature_column.bucketized_column(
# 1-D的numeric column
source_column=numeric_column,
# 要求列表为升序
boundaries=[3, 5, 7, 10])
# 输入为"fea_14"数值特征
# 结果为
# 输出方式:1
# 使用input_layer的方式的输出
[[1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
0. 0. 0. 1. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
1. 0. 0. 0. 0. 1.]]
# 输出方式:2
# get_dense_tensor的方式的输出
[[[[1. 0. 0. 0. 0.]
[1. 0. 0. 0. 0.]]
[[0. 1. 0. 0. 0.]
[0. 1. 0. 0. 0.]]
[[0. 0. 1. 0. 0.]
[0. 0. 1. 0. 0.]]]
[[[0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0.]]
[[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1.]]
[[0. 0. 0. 0. 1.]
[0. 0. 0. 0. 1.]]]]
column : categorical_column_with_vocabulary_list
sequence_categorical_column_with_vocabulary_list
categorical_column_with_vocabulary_file
sequence_categorical_column_with_vocabulary_file
categorical_column_with_identity
sequence_categorical_column_with_identity
categorical_column_with_hash_bucket
sequence_categorical_column_with_hash_bucket
crossed_column
weighted_categorical_column
tf.feature_column.indicator_column(column)
column : categorical_column_with_vocabulary_list
sequence_categorical_column_with_vocabulary_list
categorical_column_with_vocabulary_file
sequence_categorical_column_with_vocabulary_file
categorical_column_with_identity
sequence_categorical_column_with_identity
categorical_column_with_hash_bucket
sequence_categorical_column_with_hash_bucket
crossed_column
weighted_categorical_column
tf.feature_column.embedding_column(column)
numeric_column = tf.feature_column.numeric_column(
key="feature",
shape=6,
default_value=0,
dtype=tf.float32)
column = tf.feature_column.bucketized_column(
# 1-D的numeric column
source_column=numeric_column,
# 要求列表为升序
boundaries=[3, 5, 7, 10])
column : categorical_column_with_vocabulary_list
sequence_categorical_column_with_vocabulary_list
categorical_column_with_vocabulary_file
sequence_categorical_column_with_vocabulary_file
categorical_column_with_identity
sequence_categorical_column_with_identity
categorical_column_with_hash_bucket
sequence_categorical_column_with_hash_bucket
crossed_column
weighted_categorical_column
tf.feature_column.shared_embeddings(column,column)
feature_schema = {
# featureA: 一维字符串特征,可用于分类特征(可加权):categorical_column_with*
"featureA": tf.io.FixedLenFeature(shape=(1,), dtype=tf.string, default_value="null"),
# featureB: 一维数值特征,可用于实数特征:numeric_column
"featureB": tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=0.0),
# featureC: 三维字符串特征,可用于分类特征(可加权):categorical_column_with*
"featureC": tf.io.FixedLenFeature(shape=(3,), dtype=tf.string, default_value=["null", "null", "null"]),
# featureD: 二维数值特征,可用于实数特征:numeric_column
"featureD": tf.io.FixedLenFeature(shape=(2,), dtype=tf.int64, default_value=[0, 0]),
# featureE: 不固定维度字符串特征可用于分类特征(可加权):categorical_column_with*
"featureE": tf.io.VarLenFeature(dtype=tf.string),
# featureF: 不固定维度数值特征
"featureF": tf.io.VarLenFeature(dtype=tf.float32),
# featureG: 二维字符串序列特征
"featureG": tf.io.FixedLenSequenceFeature(shape=(2,), dtype=tf.string, allow_missing=True, default_value=None),
# featureH: 三维数值序列特征
"featureH": tf.io.FixedLenSequenceFeature(shape=(3,), dtype=tf.int64, allow_missing=True, default_value=None),
# featureI: 21 * 4 * 10 维字符串稀疏特征
"featureI": tf.io.SparseFeature(index_key=["featureI_Index0", "featureI_Index1", "featureI_Index2"],
value_key="featureI_value", dtype=tf.string, size=[21, 4, 10], already_sorted=False)
}
在TensorFlow1.x中:
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
sess.run(tf.tables_initializer())
print(sess.run(rs_1))
print(rs_1.eval())
print(tf.sparse_tensor_to_dense(rs_2.id_tensor,-1).eval())