
Commit 9551f33

+ Fix #94 (typo error in tf feature_embedding)
+ Fix issue in build_dataset for skipping rebuilding dataset
+ Fix typo error in DIEN
xpai committed Jul 11, 2024
1 parent e3a9ffe commit 9551f33
Showing 4 changed files with 44 additions and 41 deletions.
7 changes: 5 additions & 2 deletions CHANGELOG.md
@@ -4,10 +4,13 @@
 [Doing] Add support for saving pb file, exporting embeddings
 [Doing] Add support of multi-gpu training
 
-**FuxiCTR v2.3.2, 2024-07-14**
-+ [Fix] Fix typo error in copy_from of version v2.3.1
+**FuxiCTR v2.3.2, 2024-07-11**
 + [Feature] Add TransAct model
 + [Feature] Add new feature type `embedding`, supporting [`meta`, `numeric`, `embedding`, `categorical`, `sequence`]
++ [Fix] Fix typo error in copy_from of version v2.3.1
++ [Fix] Fix issue in build_dataset for skipping rebuilding dataset
++ [Fix] Fix typo error in DIEN (AUGRUCell->AGRUCell)
++ [Fix] Fix typo error in feature_embedding of tf version ([#94](https://github.com/reczoo/FuxiCTR/issues/94))
 
 **FuxiCTR v2.3.1, 2024-06-09**
 + [Fix] Fix customized preprocessors based on polars and update demos
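For orientation, features are declared in the `feature_cols` list of a FuxiCTR dataset config; the sketch below shows how the new `embedding` type might sit alongside the existing types. Only the five type values come from the changelog entry above — the field layout follows FuxiCTR's usual `name`/`active`/`dtype`/`type` convention, and every concrete value is an illustrative assumption, not taken from this commit.

```python
# Hypothetical feature_cols entries; only the five `type` values are from the
# changelog above, everything else is illustrative.
feature_cols = [
    {"name": "user_id", "active": True, "dtype": "str", "type": "categorical"},
    {"name": "price", "active": True, "dtype": "float", "type": "numeric"},
    {"name": "click_history", "active": True, "dtype": "str", "type": "sequence"},
    {"name": "item_pretrained_emb", "active": True, "dtype": "str", "type": "embedding"},
]
```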
74 changes: 37 additions & 37 deletions fuxictr/preprocess/build_dataset.py
@@ -82,47 +82,47 @@ def build_dataset(feature_encoder, train_data=None, valid_data=None, test_data=N
     if rebuild_dataset:
         feature_map_path = os.path.join(feature_encoder.data_dir, "feature_map.json")
         if os.path.exists(feature_map_path):
-            logging.warn(f"Skip rebuilding {feature_map_path}. "
-                         + "Please delete it manually if rebuilding is required.")
-
-        # Load data files
-        train_ddf = feature_encoder.read_data(train_data, **kwargs)
-        valid_ddf = None
-        test_ddf = None
+            logging.warn(f"Skip rebuilding {feature_map_path}. "
+                         + "Please delete it manually if rebuilding is required.")
+        else:
+            # Load data files
+            train_ddf = feature_encoder.read_data(train_data, **kwargs)
+            valid_ddf = None
+            test_ddf = None
 
-        # Split data for train/validation/test
-        if valid_size > 0 or test_size > 0:
-            valid_ddf = feature_encoder.read_data(valid_data, **kwargs)
-            test_ddf = feature_encoder.read_data(test_data, **kwargs)
-            # TODO: check split_train_test in lazy mode
-            train_ddf, valid_ddf, test_ddf = split_train_test(train_ddf, valid_ddf, test_ddf,
-                                                              valid_size, test_size, split_type)
+            # Split data for train/validation/test
+            if valid_size > 0 or test_size > 0:
+                valid_ddf = feature_encoder.read_data(valid_data, **kwargs)
+                test_ddf = feature_encoder.read_data(test_data, **kwargs)
+                # TODO: check split_train_test in lazy mode
+                train_ddf, valid_ddf, test_ddf = split_train_test(train_ddf, valid_ddf, test_ddf,
+                                                                  valid_size, test_size, split_type)
 
-        # fit and transform train_ddf
-        train_ddf = feature_encoder.preprocess(train_ddf)
-        feature_encoder.fit(train_ddf, rebuild_dataset=True, **kwargs)
-        transform(feature_encoder, train_ddf, 'train', block_size=data_block_size)
-        del train_ddf
-        gc.collect()
+            # fit and transform train_ddf
+            train_ddf = feature_encoder.preprocess(train_ddf)
+            feature_encoder.fit(train_ddf, rebuild_dataset=True, **kwargs)
+            transform(feature_encoder, train_ddf, 'train', block_size=data_block_size)
+            del train_ddf
+            gc.collect()
 
-        # Transform valid_ddf
-        if valid_ddf is None and (valid_data is not None):
-            valid_ddf = feature_encoder.read_data(valid_data, **kwargs)
-        if valid_ddf is not None:
-            valid_ddf = feature_encoder.preprocess(valid_ddf)
-            transform(feature_encoder, valid_ddf, 'valid', block_size=data_block_size)
-            del valid_ddf
-            gc.collect()
+            # Transform valid_ddf
+            if valid_ddf is None and (valid_data is not None):
+                valid_ddf = feature_encoder.read_data(valid_data, **kwargs)
+            if valid_ddf is not None:
+                valid_ddf = feature_encoder.preprocess(valid_ddf)
+                transform(feature_encoder, valid_ddf, 'valid', block_size=data_block_size)
+                del valid_ddf
+                gc.collect()
 
-        # Transform test_ddf
-        if test_ddf is None and (test_data is not None):
-            test_ddf = feature_encoder.read_data(test_data, **kwargs)
-        if test_ddf is not None:
-            test_ddf = feature_encoder.preprocess(test_ddf)
-            transform(feature_encoder, test_ddf, 'test', block_size=data_block_size)
-            del test_ddf
-            gc.collect()
-        logging.info("Transform csv data to parquet done.")
+            # Transform test_ddf
+            if test_ddf is None and (test_data is not None):
+                test_ddf = feature_encoder.read_data(test_data, **kwargs)
+            if test_ddf is not None:
+                test_ddf = feature_encoder.preprocess(test_ddf)
+                transform(feature_encoder, test_ddf, 'test', block_size=data_block_size)
+                del test_ddf
+                gc.collect()
+            logging.info("Transform csv data to parquet done.")
 
     train_data, valid_data, test_data = (
         os.path.join(feature_encoder.data_dir, "train"), \
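The substance of this fix is pure control flow: previously the warning fired but execution fell through and the dataset was rebuilt anyway; wrapping the rebuild block in `else:` makes the skip real. A minimal runnable sketch of the corrected behavior (hypothetical helper name, standard library only):

```python
import logging
import os

logging.basicConfig(level=logging.INFO)

def maybe_rebuild(feature_map_path: str) -> None:
    # Hypothetical distillation of the fixed control flow in build_dataset.
    if os.path.exists(feature_map_path):
        # Before this commit there was no else-branch: the warning was
        # logged, then the rebuild code still executed unconditionally.
        logging.warning(f"Skip rebuilding {feature_map_path}. "
                        + "Please delete it manually if rebuilding is required.")
    else:
        # After the fix, rebuilding runs only when feature_map.json is absent.
        logging.info("Rebuilding dataset...")

maybe_rebuild("data/feature_map.json")  # logs exactly one of the two branches
```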
2 changes: 1 addition & 1 deletion fuxictr/tensorflow/layers/embeddings/feature_embedding.py
@@ -91,7 +91,7 @@ def __init__(self,
                 continue
 
             if feature_spec["type"] == "numeric":
-                self.embedding_layers[feature] = tf.keras.layers.Dense(feat_emb_dim, user_bias=False)
+                self.embedding_layers[feature] = tf.keras.layers.Dense(feat_emb_dim, use_bias=False)
             elif feature_spec["type"] == "categorical":
                 padding_idx = feature_spec.get("padding_idx", None)
                 embedding_matrix = Embedding(feature_spec["vocab_size"],
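The one-character rename matters because Keras layers reject unrecognized constructor keywords, so the typo'd call could never have built successfully. A minimal sketch of the corrected numeric-feature projection (illustrative dimensions; 8 stands in for feat_emb_dim):

```python
import tensorflow as tf

# tf.keras.layers.Dense(8, user_bias=False) fails at construction time with
# an unrecognized-keyword error (the exact exception type varies by Keras
# version), so the typo broke numeric features entirely.
layer = tf.keras.layers.Dense(8, use_bias=False)  # bias-free linear projection
x = tf.ones((4, 1))     # a batch of 4 scalar numeric feature values
print(layer(x).shape)   # (4, 8): each scalar mapped to an 8-dim embedding
```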
2 changes: 1 addition & 1 deletion model_zoo/DIEN/src/DIEN.py
@@ -325,7 +325,7 @@ def __init__(self, input_size, hidden_size, bias=True, gru_type='AUGRU'):
         if gru_type == "AUGRU":
             self.gru_cell = AUGRUCell(input_size, hidden_size, bias=bias)
         elif gru_type == "AGRU":
-            self.gru_cell = AUGRUCell(input_size, hidden_size, bias=bias)
+            self.gru_cell = AGRUCell(input_size, hidden_size, bias=bias)
 
     def forward(self, packed_seq_emb, attn_score=None, h=None):
         assert isinstance(packed_seq_emb, PackedSequence) and isinstance(attn_score, PackedSequence), \
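For context, AGRUCell and AUGRUCell differ only in how the attention score a_t enters the GRU update, which is why instantiating the wrong cell silently changed the model rather than crashing: per the DIEN paper, AGRU replaces the update gate with a_t outright, while AUGRU rescales the learned gate by a_t. A minimal sketch of the two update rules (hypothetical helper functions, not the FuxiCTR API):

```python
import torch

def agru_update(h_prev, h_tilde, attn):
    # AGRU: the attention score replaces the update gate:
    # h_t = (1 - a_t) * h_{t-1} + a_t * h~_t
    return (1.0 - attn) * h_prev + attn * h_tilde

def augru_update(h_prev, h_tilde, update_gate, attn):
    # AUGRU: the attention score rescales the learned update gate:
    # u~_t = a_t * u_t;  h_t = (1 - u~_t) * h_{t-1} + u~_t * h~_t
    scaled_gate = attn * update_gate
    return (1.0 - scaled_gate) * h_prev + scaled_gate * h_tilde

h_prev, h_tilde = torch.zeros(1, 4), torch.ones(1, 4)
gate, attn = torch.full((1, 4), 0.5), torch.tensor(0.8)
print(agru_update(h_prev, h_tilde, attn))         # all 0.8
print(augru_update(h_prev, h_tilde, gate, attn))  # all 0.4
```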
