TensorFlow Recommenders - ValueError: Shape must be rank 2 but is rank 3
Posted
技术标签:
【中文标题】TensorFlow Recommenders - ValueError: Shape must be rank 2 but is rank 3【英文标题】:TensorFlow Recommenders - ValueError: Shape must be rank 2 but is rank 3 【发布时间】:2021-12-29 14:14:32 【问题描述】:注意:由于我确实需要以流式方式加载数据,而不是一次性读入内存,请在示例中使用 tf.data.experimental.make_csv_dataset。另外,请使用我提供的确切数据集给出示例。
我正在尝试用一个玩具数据集复现这篇 TensorFlow Recommenders 教程。但是,我收到以下错误:
Epoch 1/5
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/tmp/ipykernel_7920/1393870474.py in <module>
106
107 # Train.
--> 108 model.fit(interactions, epochs=5)
109
110 # Evaluate.
~/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
65 except Exception as e: # pylint: disable=broad-except
66 filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67 raise e.with_traceback(filtered_tb) from None
68 finally:
69 del filtered_tb
~/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py in autograph_handler(*args, **kwargs)
1127 except Exception as e: # pylint:disable=broad-except
1128 if hasattr(e, "ag_error_metadata"):
-> 1129 raise e.ag_error_metadata.to_exception(e)
1130 else:
1131 raise
ValueError: in user code:
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function *
return step_function(self, iterator)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step **
outputs = model.train_step(data)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/models/base.py", line 68, in train_step
loss = self.compute_loss(inputs, training=True)
File "/tmp/ipykernel_7920/1393870474.py", line 94, in compute_loss
return self.task(user_embeddings, channel_embeddings)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
ValueError: Exception encountered when calling layer "retrieval" (type Retrieval).
in user code:
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/tasks/retrieval.py", line 143, in call *
metric_update_ops.append(
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/metrics/factorized_top_k.py", line 84, in update_state *
top_k_predictions, _ = self._candidates(query_embeddings, k=self._k)
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler **
raise e.with_traceback(filtered_tb) from None
ValueError: Exception encountered when calling layer "streaming" (type Streaming).
in user code:
File "/home/george/anaconda3/envs/srs/lib/python3.9/site-packages/tensorflow_recommenders/layers/factorized_top_k.py", line 441, in top_k *
joined_scores = tf.concat([state_scores, x_scores], axis=1)
ValueError: Shape must be rank 2 but is rank 3 for 'node concat = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](args_0, args_2, concat/axis)' with input shapes: [1,0], [?,1,1], [].
Call arguments received:
• queries=tf.Tensor(shape=(1, 64), dtype=float32)
• k=100
Call arguments received:
• query_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
• candidate_embeddings=tf.Tensor(shape=(1, 64), dtype=float32)
• sample_weight=None
• candidate_sampling_probability=None
• candidate_ids=None
• compute_metrics=True
这是我的代码:
from typing import Dict, Text
import pandas as pd
from pathlib import Path
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
# Toy interaction log: one row per (user, channel) event.
df_interactions = pd.DataFrame({
    'user_id': [
        '00001446-da5f-4d17',
        '00001446-da5f-4d17',
        '00005ab5-c9e0-4b05-',
        '00005ab5-c9e0-4b05-',
        '000093dd-1a11-4600',
        '000093dd-1a11-4600',
        '00009b34-65b5-42c1',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd'
    ],
    'channel_id': [
        '1', '2', 'A56',
        '3', 'B72', '2',
        'M63', '2', '5', 'A56'
    ]
})
df_interactions.to_csv('experiment_interactions.csv', index=False)

# Catalogue of candidate channels.
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', 'A56', 'B72', 'M63'
    ],
    'channel_name': [
        'Popular',
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice',
        'Retro',
        'Modern'
    ]
})
df_channels.to_csv('experiment_channels.csv', index=False)

# Stream both CSV files from disk instead of holding them in memory.
# NOTE: make_csv_dataset yields *batches*; with batch_size=1 every feature
# value is a length-1 vector rather than a scalar.
interactions = tf.data.experimental.make_csv_dataset(
    file_pattern='experiment_interactions.csv',
    column_defaults=[tf.string, tf.string],
    batch_size=1
)
channels = tf.data.experimental.make_csv_dataset(
    file_pattern='experiment_channels.csv',
    column_defaults=[tf.string, tf.string],
    batch_size=1
)

# Select the basic features.
# NOTE(review): tf.strings.to_number only parses numeric strings; ids such as
# '00001446-da5f-4d17' or 'A56' are not numeric, so this conversion is one of
# the problems discussed in the answer below. Kept as posted in the question.
interactions = interactions.map(lambda x: {
    "user_id": tf.strings.to_number(x["user_id"]),
    "channel_id": tf.strings.to_number(x["channel_id"])
})
channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))
# Build a model.
class Model(tfrs.Model):
    """Two-tower retrieval model with one embedding table per tower.

    Users and channels are each embedded into a 64-dimensional space and
    trained with a ``tfrs.tasks.Retrieval`` task whose ``FactorizedTopK``
    metric is evaluated against the module-level ``channels`` dataset.
    """

    def __init__(self):
        super().__init__()
        # Set up user representation.
        self.user_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up channel representation.
        self.item_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up a retrieval task and evaluation metrics over the
        # entire dataset of candidates.
        # NOTE(review): `channels` comes from make_csv_dataset(batch_size=1)
        # and is therefore already batched; the extra .batch(1) adds one more
        # axis, which matches the rank-3 `[?,1,1]` shape in the reported
        # ValueError. Kept as posted in the question.
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=channels.batch(1).map(self.item_model)
            )
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch.

        Args:
            features: mapping with the keys ``"user_id"`` and ``"channel_id"``.
            training: accepted for API compatibility; not used here.

        Returns:
            The value produced by calling ``self.task`` on the user and
            channel embeddings.
        """
        user_embeddings = self.user_model(features["user_id"])
        channel_embeddings = self.item_model(features["channel_id"])
        return self.task(user_embeddings, channel_embeddings)
# Instantiate the model and attach an Adagrad optimizer (learning rate 0.5).
model = Model()
optimizer = tf.keras.optimizers.Adagrad(0.5)
model.compile(optimizer=optimizer)

# Fix the global seed. The tutorial's shuffle/split step is left disabled:
#   shuffled = interactions.shuffle(100000, seed=42, reshuffle_each_iteration=False)
#   train = shuffled.take(80000)
#   test = shuffled.skip(80000).take(20000)
tf.random.set_seed(42)

# Train on the full interactions stream.
model.fit(interactions, epochs=5)
附加信息:
TensorFlow 版本:'2.7.0';TensorFlow Datasets 版本:'4.4.0';Pandas 版本:'1.3.4'【问题讨论】:
【参考方案1】:您对数据的预处理似乎不正确。例如,您不能使用 tf.strings.to_number 将 00001446-da5f-4d17 转换为数字:该字符串不只包含数字,因此会引发错误。此外,make_csv_dataset 返回的数据集已经按 batch_size=1 分好批,因此数据集中的每个样本都是一个数组,而不是单个标量:例如,频道 1 不是 1,而是 [1]。在此基础上再调用 channels.batch(1),候选嵌入就会多出一个维度,这正是您问题中 “rank 3” 错误的原因。下面是基于您的代码的简化可运行示例:
from typing import Dict, Text
import pandas as pd
from pathlib import Path
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
# Simplified toy data: ids are short enough that, once non-digit characters
# are stripped, the resulting numbers stay below the embedding input_dim.
df_interactions = pd.DataFrame({
    'user_id': [
        '4d17',
        '4d17',
        '4b05',
        '4b05',
        '93dd',
        '93dd',
        '9b34',
        '4bcd',
        '-4bcd',
        '4bcd'
    ],
    'channel_id': [
        '1', '2', '6',
        '3', '7', '2',
        '8', '2', '5', '6'
    ]
})
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', '6', '7', '8'
    ],
    'channel_name': [
        'Popular',
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice',
        'Retro',
        'Modern'
    ]
})

# Build unbatched datasets: each element is a dict of scalar string tensors.
interactions = tf.data.Dataset.from_tensor_slices(dict(df_interactions))
# The character class '[^0-9^]' matches every character that is neither a
# digit nor a literal '^'; those characters are removed before parsing
# (e.g. '4d17' -> '417').
interactions = interactions.map(lambda x: {
    "user_id": tf.strings.to_number(
        tf.strings.regex_replace(x["user_id"], '[^0-9^]', "")),
    "channel_id": tf.strings.to_number(x["channel_id"])
})
channels = tf.data.Dataset.from_tensor_slices(dict(df_channels))
channels = channels.map(lambda x: tf.strings.to_number(x["channel_id"]))
# Build a model.
class Model(tfrs.Model):
    """Two-tower retrieval model with one embedding table per tower.

    Users and channels are each embedded into a 64-dimensional space and
    trained with a ``tfrs.tasks.Retrieval`` task whose ``FactorizedTopK``
    metric is evaluated against the module-level ``channels`` dataset.
    """

    def __init__(self):
        super().__init__()
        # Set up user representation.
        self.user_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up channel representation.
        self.item_model = tf.keras.layers.Embedding(
            input_dim=2000, output_dim=64)
        # Set up a retrieval task and evaluation metrics over the
        # entire dataset of candidates. The candidates are batched once here
        # before being passed through the channel embedding.
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=channels.batch(1).map(self.item_model)
            )
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch.

        Args:
            features: mapping with the keys ``"user_id"`` and ``"channel_id"``.
            training: accepted for API compatibility; not used here.

        Returns:
            The value produced by calling ``self.task`` on the user and
            channel embeddings.
        """
        user_embeddings = self.user_model(features["user_id"])
        channel_embeddings = self.item_model(features["channel_id"])
        return self.task(user_embeddings, channel_embeddings)
# Instantiate the model and attach an Adagrad optimizer (learning rate 0.5).
model = Model()
optimizer = tf.keras.optimizers.Adagrad(0.5)
model.compile(optimizer=optimizer)

tf.random.set_seed(42)

# The interactions dataset is unbatched, so batch it once for training.
train_batches = interactions.batch(1)
model.fit(train_batches, epochs=5)
Epoch 1/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 2/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 3/5
10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 4/5
10/10 [==============================] - 1s 60ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
Epoch 5/5
10/10 [==============================] - 1s 61ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0000 - factorized_top_k/top_5_categorical_accuracy: 1.0000 - factorized_top_k/top_10_categorical_accuracy: 1.0000 - factorized_top_k/top_50_categorical_accuracy: 1.0000 - factorized_top_k/top_100_categorical_accuracy: 1.0000 - loss: 0.0000e+00 - regularization_loss: 0.0000e+00 - total_loss: 0.0000e+00
<keras.callbacks.History at 0x7fe480d22f50>
如果您想将文件读入数据集,请尝试以下操作:
################## ORIGINAL DATASET ##################
df_interactions = pd.DataFrame({
    'user_id': [
        '00001446-da5f-4d17',
        '00001446-da5f-4d17',
        '00005ab5-c9e0-4b05-',
        '00005ab5-c9e0-4b05-',
        '000093dd-1a11-4600',
        '000093dd-1a11-4600',
        '00009b34-65b5-42c1',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd',
        '0000ae32-4a91-4bcd'
    ],
    'channel_id': [
        '1', '2', 'A56',
        '3', 'B72', '2',
        'M63', '2', '5', 'A56'
    ]
})
# The channel ids must cover every id used in df_interactions above
# ('A56', 'B72', 'M63'), so that the channel vocabulary adapted from this
# table contains them.
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', 'A56', 'B72', 'M63'
    ],
    'channel_name': [
        'Popular',
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice',
        'Retro',
        'Modern'
    ]
})

# Alternative (simplified) dataset, disabled by wrapping it in a string.
"""
################## MODIFIED DATASET ##################
df_interactions = pd.DataFrame({
    'user_id': [
        '4d17',
        '4d17',
        '4b05',
        '4b05',
        '93dd',
        '93dd',
        '9b34',
        '4bcd',
        '-4bcd',
        '4bcd'
    ],
    'channel_id': [
        '1', '2', '6',
        '3', '7', '2',
        '8', '2', '5', '6'
    ]
})
df_channels = pd.DataFrame({
    'channel_id': [
        '1', '2', '3', '5', '6', '7', '8'
    ],
    'channel_name': [
        'Popular',
        'Best',
        'Highest Rated',
        'Large Following',
        'Nice',
        'Retro',
        'Modern'
    ]
})
"""

# Write the tables to disk, then read them back as two-column string CSV
# datasets, skipping the header row.
df_channels.to_csv('experiment_channels.csv', index=False)
df_interactions.to_csv('experiment_interactions.csv', index=False)
channels = tf.data.experimental.CsvDataset(
    'experiment_channels.csv', [tf.string, tf.string], header=True)
interactions = tf.data.experimental.CsvDataset(
    'experiment_interactions.csv', [tf.string, tf.string], header=True)
def preprocess_channels(x, y):
    """Keep only the first column of a two-column channels row.

    Args:
        x: value of the first CSV column (``channel_id`` in the channels file).
        y: value of the second CSV column; ignored.

    Returns:
        ``x`` unchanged.
    """
    return x
def preprocess_interactions(x, y):
    """Turn a two-column interactions row into a feature dict.

    Args:
        x: value of the first CSV column (``user_id``).
        y: value of the second CSV column (``channel_id``).

    Returns:
        A dict with ``"user_id"`` (``x`` with every character that is neither
        a digit nor a literal ``^`` removed) and ``"channel_id"`` (``y``
        unchanged).
    """
    # The opening brace must sit on the `return` line; a bare `return`
    # followed by the dict on the next line would return None.
    return {
        "user_id": tf.strings.regex_replace(x, '[^0-9^]', ""),
        "channel_id": y,
    }
# Reduce both CSV datasets to the features the model consumes.
channels = channels.map(preprocess_channels)
interactions = interactions.map(preprocess_interactions)

# Build the user vocabulary from the user ids seen in the interactions.
user_id_stream = interactions.map(lambda row: row["user_id"])
interactions_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
interactions_vocabulary.adapt(user_id_stream)

# Build the channel vocabulary from the channel ids in the catalogue.
channels_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
channels_vocabulary.adapt(channels)
# Build a model.
class Model(tfrs.Model):
    """Two-tower retrieval model that looks up string ids before embedding.

    Each tower is a ``StringLookup`` layer followed by an ``Embedding`` whose
    number of rows equals the size of that lookup's vocabulary. Training uses
    a ``tfrs.tasks.Retrieval`` task whose ``FactorizedTopK`` metric is
    evaluated against the module-level ``channels`` dataset.
    """

    def __init__(self):
        super().__init__()
        # User tower: string id -> vocabulary index -> 64-d embedding.
        self.user_model = tf.keras.Sequential([
            interactions_vocabulary,
            tf.keras.layers.Embedding(interactions_vocabulary.vocabulary_size(), 64)
        ])
        # Channel tower: string id -> vocabulary index -> 64-d embedding.
        self.item_model = tf.keras.Sequential([
            channels_vocabulary,
            tf.keras.layers.Embedding(channels_vocabulary.vocabulary_size(), 64)
        ])
        # Set up a retrieval task and evaluation metrics over the
        # entire dataset of candidates.
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=channels.batch(1).map(self.item_model)
            )
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch.

        Args:
            features: mapping with the keys ``"user_id"`` and ``"channel_id"``.
            training: accepted for API compatibility; not used here.

        Returns:
            The value produced by calling ``self.task`` on the user and
            channel embeddings.
        """
        user_embeddings = self.user_model(features["user_id"])
        channel_embeddings = self.item_model(features["channel_id"])
        return self.task(user_embeddings, channel_embeddings)
# Instantiate the model and attach an Adagrad optimizer (learning rate 0.5).
model = Model()
optimizer = tf.keras.optimizers.Adagrad(0.5)
model.compile(optimizer=optimizer)

tf.random.set_seed(42)

# CsvDataset yields one row per element, so batch it once for training.
train_batches = interactions.batch(1)
model.fit(train_batches, epochs=5)
注意,这个示例使用了 tf.keras.layers.StringLookup。tf.data.experimental.CsvDataset 类提供的是一个最小化的 CSV 数据集接口;不过,与使用更高级的 API(如 tf.data.experimental.make_csv_dataset)相比,它给您的灵活性要大得多。更多信息请查看文档。
【讨论】:
感谢您的回答!我按原样尝试了您的代码,但得到了一个不同的错误:“InvalidArgumentError”,更多细节见:docs.google.com/document/d/… 这可能是 TFRS 版本造成的吗?我的是 2.7.0。我已经在上面帖子的底部做了更新。
我在 Google Colab 上运行了所有代码,并且是这样安装 tfrs 的:!pip install tensorflow_recommenders。不过您说得对,确实有地方出了问题。
我刚刚在 Colab 上试了一下,但仍然遇到同样的错误……您介意看看这里吗?colab.research.google.com/drive/… 谢谢!
由于 user_id 经过预处理后得到的数字很大,超过了 embedding 层预定义的 input_dim,所以 embedding 层不断报错。我只是简化了你的玩具数据集,现在应该可以正常运行了。
已更新答案,底部增加了使用 tf.keras.layers.StringLookup 的示例。
以上是关于TensorFlow Recommenders - ValueError: Shape must be rank 2 but is rank 3的主要内容,如果未能解决你的问题,请参考以下文章:
谷歌开源推荐系统库(TensorFlow Recommenders)
TensorFlow Recommenders 迎来更新 — 可扩展检索和特征交互建模
TensorFlow Recommenders - ValueError: Shape must be rank 2 but is rank 3
Deep & Cross Network 6 使用 TensorFlow 构建推荐系统
Eclipse安装Code Recommenders代码联想模糊提示,解决安装Code Recommenders代码提示神器出错。
Eclipse安装Code Recommenders代码联想模糊提示,解决安装Code Recommenders代码提示神器出错。