WDL-特征生成
Posted 我家大宝最可爱
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了WDL-特征生成相关的知识,希望对你有一定的参考价值。
import json
import os
import tensorflow as tf
# FeatureGenerator的设计思想
# 将特征分为几类
# 1. 极度稀疏的特征,item_id,user_id等,这样的特征无法直接输入模型,需要通过hash之后变成onehot再输入模型,我们将这样的特征存入__sparse_indicator_map
# 2. 可枚举的稀疏特征,is_week,is_baoyou等,这样的特征维度较小,可以直接枚举后进行onehot编码,我们将这样的特征存入__indicator_map
# 3. 数值型特征,数值特征输入一般都是实数,可以直接输入到__numeric_map,如果对数据有切分的话,我们就存入__bucket_map
# 4. embedding特征,embedding通常是由稀疏特征产生的,不管是由什么类型的特征产生的,最终都是一组实数,所以统一存入__embedding_map
# 之后我们将这些特征的名字也保存到相应的list中,以方便后续输入模型的时候进行取出
# 模型保存的时候需要传入每个特征的placeholder,我们使用了什么特征就将这个特征添加到select_cloumns中,如果没有使用的话就不添加
class FeatureGenerator:
    """Build ``tf.feature_column`` objects from a JSON feature description.

    Every entry of the ``features`` list in the description file is turned
    into one or more feature columns, which are filed by kind:

    * ``sparse_indicator`` -- string features without a vocabulary; they are
      hashed into ``hash_bucket_size`` buckets and wrapped in an indicator
      column.  Crossed features made by ``cross_feature`` are filed here too.
    * ``indicator`` -- string features that declare a ``vocabulary``.
    * ``numeric`` -- ``bigint`` / ``double`` features.
    * ``bucket`` -- numeric features that also declare ``boundaries``.
    * ``embedding`` -- string or crossed features with a positive embedding
      dimension.

    Public attributes:
        features_name: dict mapping each kind above to the list of feature
            names registered under it.
        select_cloumns: set of input feature names requested so far through
            ``add_feature`` / ``cross_feature``.  The misspelling is part of
            the public interface and is kept for backward compatibility.
    """

    def __init__(self, fg_file):
        """Parse *fg_file* and register feature columns for its features.

        Args:
            fg_file: path of the JSON feature-description file.

        Raises:
            TypeError: if *fg_file* does not point to an existing file.
        """
        # Names and one-element default lists of every described feature
        # (label columns included), in the order they appear in the file.
        self._CSV_COLUMN_DEFAULTS = []
        self._CSV_COLUMNS = []
        # Feature name -> feature column, one registry per kind.
        self.__indicator_map = {}
        self.__sparse_indicator_map = {}
        self.__embedding_map = {}
        self.__bucket_map = {}
        self.__numeric_map = {}
        # Feature name -> tf.placeholder for every non-label feature.
        self.__feature_placeholder = {}
        self.select_cloumns = set()
        # Keys are lower-case; value types are normalised before lookup.
        self.__type_map = {
            "double": tf.float32,
            "bigint": tf.int32,
            "string": tf.string,
        }
        self.features_name = {
            'sparse_indicator': [],
            'indicator': [],
            'embedding': [],
            'bucket': [],
            'numeric': [],
        }
        self.fg_to_tf_feature(fg_file)

    @staticmethod
    def _normalize_value_type(value_type):
        """Return *value_type* as a lower-case string.

        The description file may spell types in upper case (``BIGINT``,
        ``DOUBLE``, ``STRING``) while the dtype table only has lower-case
        keys, so every type is normalised once before it is used.
        """
        return str(value_type).lower()

    def fg_to_tf_feature(self, fg_file):
        """Read *fg_file* and register placeholders and feature columns.

        For every entry of the ``features`` list the name and default value
        are recorded.  Entries named ``clk_label``, ``buy_label`` or
        ``label`` get nothing else; all other entries get a placeholder of
        shape ``[None, 1]`` plus the feature columns matching their type.

        Args:
            fg_file: path of the JSON feature-description file.

        Raises:
            TypeError: if *fg_file* does not point to an existing file.
            KeyError: if a non-label feature has a value type other than
                ``double``, ``bigint`` or ``string`` (case-insensitive).
            ValueError: if a string feature has neither a vocabulary nor a
                non-zero ``hash_bucket_size``.
        """
        if not os.path.isfile(fg_file):
            # TypeError is kept (instead of a file-specific error) because
            # it is the exception existing callers may already handle.
            raise TypeError(fg_file + " does not exist")

        with open(fg_file, 'r') as f:
            description = json.load(f)

        for feature in description['features']:
            feature_name = feature.get('feature_name', '')
            default_value = feature.get('default_value', '')
            value_type = self._normalize_value_type(
                feature.get('value_type', ''))

            self._CSV_COLUMN_DEFAULTS.append([default_value])
            self._CSV_COLUMNS.append(feature_name)

            # Label columns are recorded above but are not model inputs.
            if feature_name in ('clk_label', 'buy_label', 'label'):
                continue

            self.__feature_placeholder[feature_name] = tf.placeholder(
                self.__type_map[value_type],
                shape=[None, 1],
                name=feature_name)

            if value_type in ('bigint', 'double'):
                self._register_numeric(feature_name, feature)
            elif value_type == 'string':
                self._register_string(feature_name, feature)

    def _register_numeric(self, feature_name, feature):
        """Register the numeric (and optional bucketized) column."""
        numeric_column = tf.feature_column.numeric_column(feature_name)
        self.__numeric_map[feature_name] = numeric_column
        self.features_name['numeric'].append(feature_name)

        boundaries = feature.get('boundaries', None)
        if boundaries:
            self.__bucket_map[feature_name] = (
                tf.feature_column.bucketized_column(
                    numeric_column, boundaries))
            self.features_name['bucket'].append(feature_name)

    def _register_string(self, feature_name, feature):
        """Register the indicator (and optional embedding) column."""
        vocabulary = feature.get('vocabulary', None)
        hash_bucket_size = feature.get('hash_bucket_size', 0)
        embedding_size = feature.get('embedding_dimension', 0)

        if vocabulary:
            categorical = (
                tf.feature_column.categorical_column_with_vocabulary_list(
                    feature_name, vocabulary))
            kind = 'indicator'
            registry = self.__indicator_map
        else:
            # Without a vocabulary the values must be hashed, which needs a
            # usable bucket count.
            if not hash_bucket_size:
                raise ValueError(
                    feature_name + ' hash_bucket_size is None or Zero')
            categorical = (
                tf.feature_column.categorical_column_with_hash_bucket(
                    feature_name, hash_bucket_size))
            kind = 'sparse_indicator'
            registry = self.__sparse_indicator_map

        registry[feature_name] = tf.feature_column.indicator_column(
            categorical)
        self.features_name[kind].append(feature_name)

        # A missing/null dimension means "no embedding" instead of failing
        # on a None > 0 comparison.
        if embedding_size and embedding_size > 0:
            self.__embedding_map[feature_name] = (
                tf.feature_column.embedding_column(
                    categorical, dimension=embedding_size))
            self.features_name['embedding'].append(feature_name)

    def cross_feature(self, features, hash_bucket_size, embedding_size=0):
        """Register a crossed feature built from several input features.

        The new feature is named by joining *features* with ``_x_`` and is
        filed under ``sparse_indicator`` (and under ``embedding`` when
        *embedding_size* is positive).  All names in *features* are added to
        ``select_cloumns``.

        Args:
            features: iterable of feature names to cross.
            hash_bucket_size: number of hash buckets of the crossed column.
            embedding_size: embedding dimension; no embedding column is
                created unless it is greater than zero.
        """
        self.select_cloumns.update(features)
        feature_name = '_x_'.join(features)

        crossed = tf.feature_column.crossed_column(features, hash_bucket_size)
        self.__sparse_indicator_map[feature_name] = (
            tf.feature_column.indicator_column(crossed))
        self.features_name['sparse_indicator'].append(feature_name)

        if embedding_size and embedding_size > 0:
            # The embedding must wrap the crossed (categorical) column
            # itself, not the indicator column derived from it.
            self.__embedding_map[feature_name] = (
                tf.feature_column.embedding_column(
                    crossed, dimension=embedding_size))
            self.features_name['embedding'].append(feature_name)

    def add_feature(self, feature_type, feature_name):
        """Return a registered feature column and mark its input as used.

        Args:
            feature_type: kind of column to fetch; one of ``'i'`` /
                ``'indicator'``, ``'s'`` / ``'sparse_indicator'``, ``'e'`` /
                ``'embedding'``, ``'b'`` / ``'bucket'``, ``'n'`` /
                ``'numeric'``.
            feature_name: name the column was registered under.

        Returns:
            The feature column registered for *feature_name*.

        Raises:
            NameError: if *feature_type* is unknown or no column of that
                kind is registered under *feature_name*.
        """
        registries = (
            (('i', 'indicator'), self.__indicator_map),
            (('s', 'sparse_indicator'), self.__sparse_indicator_map),
            (('e', 'embedding'), self.__embedding_map),
            (('b', 'bucket'), self.__bucket_map),
            (('n', 'numeric'), self.__numeric_map),
        )
        for aliases, registry in registries:
            if feature_type in aliases:
                break
        else:
            raise NameError("{} is not defined".format(feature_type))

        column = registry.get(feature_name, None)
        if column is None:
            raise NameError("{} is not defined".format(feature_name))

        # Only names from the description file are tracked here; crossed
        # names are not in _CSV_COLUMNS (cross_feature records their
        # component features itself).
        if feature_name in self._CSV_COLUMNS:
            self.select_cloumns.add(feature_name)
        return column

    def save_model_feature(self):
        """Return the placeholders of the features selected so far.

        Returns:
            dict mapping feature name to placeholder, restricted to the
            names present in ``select_cloumns``.
        """
        return {
            name: placeholder
            for name, placeholder in self.__feature_placeholder.items()
            if name in self.select_cloumns
        }
以上是关于WDL-特征生成的主要内容,如果未能解决你的问题,请参考以下文章