Description
When I run incremental training (--continue_train) and only change the zch_size of a single feature, leaving every other setting untouched, the job fails with the error below.
Config before the change:
feature_configs {
  id_feature {
    feature_name: "uuid"
    expression: "user:uuid"
    embedding_dim: 8
    default_value: ""
    zch: {
      zch_size: 10
      eviction_interval: 2
      distance_lfu {
        decay_exponent: 1.0
      }
    }
  }
}
Config after the change:
feature_configs {
  id_feature {
    feature_name: "uuid"
    expression: "user:uuid"
    embedding_dim: 8
    default_value: ""
    zch: {
      zch_size: 2000000
      eviction_interval: 2
      distance_lfu {
        decay_exponent: 1.0
      }
    }
  }
}
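The only difference between the two configs is zch_size (10 → 2000000). Below is a minimal sketch, not tzrec/torchrec code, of what the failing check in torchrec.modules.mc_modules.validate_state appears to be complaining about, assuming the restored checkpoint still carries the output-segments buffer built for the old zch_size=10: the new shard boundary 2000000 is not among the stored segment boundaries. The variable names (old_zch_size, output_segments, etc.) are illustrative assumptions, not names from the library.

```python
import torch

old_zch_size = 10          # zch_size in effect when model.ckpt-1599 was written
new_zch_size = 2_000_000   # zch_size in the modified config

# Buffer as restored from the checkpoint: segment boundaries for the old
# table, padded with -1 (matches the tensor printed in the error below).
output_segments = torch.full((1024,), -1, dtype=torch.int64)
output_segments[0] = 0
output_segments[1] = old_zch_size

# validate_state() requires the new shard range [0, new_zch_size] to be
# expressible with the restored boundaries; 2000000 is not one of them.
shard_start, shard_end = 0, new_zch_size
print(shard_start in output_segments)  # True  -- 0 is a stored boundary
print(shard_end in output_segments)    # False -- same condition the AssertionError in the log reports
```

Restoring the same checkpoint with zch_size left at 10 works, so the mismatch between the checkpointed segments and the enlarged zch_size looks like the trigger.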
Error log:
root@dsw-241660-5fd89544d5-7bd9m:/mnt/workspace# torchrun --master_addr=localhost --master_port=32555 \
    --nnodes=1 --nproc-per-node=1 --node_rank=0 \
    -m tzrec.train_eval \
    --pipeline_config_path /mnt/workspace/czy/deepfm_criteo_sng.config \
    --train_input_path /mnt/data/easyrec/data/test_10w/* \
    --eval_input_path /mnt/data/easyrec/data/test_10w/* \
    --model_dir /mnt/workspace/czy/easyrec/experiments/deep_fm \
    --continue_train True
package: pyfg
custom_lib_path: /opt/conda/lib/python3.11/site-packages
Initializing pyfg...
pyfg version: 0.5.9 loaded
/opt/conda/lib/python3.11/site-packages/torch_tensorrt/fx/tracer/acc_tracer/acc_ops.py:895: UserWarning: Unable to import torchvision related libraries.: No module named 'torchvision'. Please install torchvision lib in order to lower stochastic_depth
warnings.warn(
[2025-05-06 06:46:27,417][WARNING] Unable to import quantization op. Please install modelopt library (https://github.com/NVIDIA/TensorRT-Model-Optimizer?tab=readme-ov-file#installation) to add support for compiling quantized models
[2025-05-06 06:46:27,595][WARNING] TensorRT-LLM is not installed. Please install TensorRT-LLM or set TRTLLM_PLUGINS_PATH to the directory containing libnvinfer_plugin_tensorrt_llm.so to use converters for torch.distributed ops
[05/06/2025-06:46:27] [TRT] [W] Functionality provided through tensorrt.plugin module is experimental.
I20250506 06:46:27.731312 1016492 str_utils.cc:158] ENV: DISABLE_FG_PRECISION=true
I20250506 06:46:27.731360 1016492 str_utils.cc:38] AVX supported
I20250506 06:46:27.731369 1016492 str_utils.cc:40] FMA supported
I20250506 06:46:27.731379 1016492 str_utils.cc:57] AVX-512F supported
I20250506 06:46:27.731386 1016492 str_utils.cc:63] AVX-512VL supported
I20250506 06:46:27.731395 1016492 str_utils.cc:69] AVX-512BW supported
I20250506 06:46:27.731402 1016492 str_utils.cc:75] AVX-512DQ supported
I20250506 06:46:27.731410 1016492 str_utils.cc:202] support avx512: true
I20250506 06:46:27.731418 1016492 str_utils.cc:211] will use avx512
I20250506 06:46:27.731444 1016492 fg_handler.cc:128] Feature Num: 7
I20250506 06:46:27.731484 1016492 factory.cc:93] Registered 37 feature functions
I20250506 06:46:27.731709 1016492 fg_handler.cc:182] Parsed 7 features.
I20250506 06:46:27.731724 1016492 fg_handler.cc:1203] FgHandler (1) initialized ok, version: 0.5.9
[2025-05-06 06:46:28,167][WARNING] /opt/conda/lib/python3.11/site-packages/torch/utils/data/dataloader.py:624: UserWarning: This DataLoader will create 8 worker processes in total. Our suggested max number of worker in current system is 4, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
warnings.warn(
I20250506 06:46:28.280966 1016492 fg_handler.cc:128] Feature Num: 7
I20250506 06:46:28.281198 1016492 fg_handler.cc:182] Parsed 7 features.
I20250506 06:46:28.281222 1016492 fg_handler.cc:1203] FgHandler (2) initialized ok, version: 0.5.9
I20250506 06:46:28.314764 1016541 fg_handler.cc:88] New FgHandlerImpl(0x55aad1d24220) created by FgHandler(1)
I20250506 06:46:28.316426 1016542 fg_handler.cc:88] New FgHandlerImpl(0x55aad1d24220) created by FgHandler(1)
I20250506 06:46:28.327490 1016544 fg_handler.cc:88] New FgHandlerImpl(0x55aad1d24220) created by FgHandler(1)
I20250506 06:46:28.345464 1016545 fg_handler.cc:88] New FgHandlerImpl(0x55aad65329c0) created by FgHandler(1)
I20250506 06:46:28.353870 1016543 fg_handler.cc:88] New FgHandlerImpl(0x55aad1d24220) created by FgHandler(1)
I20250506 06:46:28.378043 1016546 fg_handler.cc:88] New FgHandlerImpl(0x55aad1d24220) created by FgHandler(1)
I20250506 06:46:28.428983 1016548 fg_handler.cc:88] New FgHandlerImpl(0x55aad1d24220) created by FgHandler(1)
I20250506 06:46:28.430663 1016547 fg_handler.cc:88] New FgHandlerImpl(0x55aad1d24220) created by FgHandler(1)
I20250506 06:46:28.627410 1016576 fg_handler.cc:88] New FgHandlerImpl(0x55aad6a0c400) created by FgHandler(2)
I20250506 06:46:28.664819 1016575 fg_handler.cc:88] New FgHandlerImpl(0x55aad6a0c400) created by FgHandler(2)
I20250506 06:46:28.691635 1016589 fg_handler.cc:88] New FgHandlerImpl(0x55aad6a0c400) created by FgHandler(2)
I20250506 06:46:28.724189 1016586 fg_handler.cc:88] New FgHandlerImpl(0x55aad651c5b0) created by FgHandler(2)
I20250506 06:46:28.739431 1016590 fg_handler.cc:88] New FgHandlerImpl(0x55aad6a0c400) created by FgHandler(2)
I20250506 06:46:28.729600 1016579 fg_handler.cc:88] New FgHandlerImpl(0x55aad6a0c400) created by FgHandler(2)
I20250506 06:46:28.742722 1016587 fg_handler.cc:88] New FgHandlerImpl(0x55aad6a0c400) created by FgHandler(2)
I20250506 06:46:28.732587 1016588 fg_handler.cc:88] New FgHandlerImpl(0x55aad6a0c400) created by FgHandler(2)
[2025-05-06 06:46:29,125][INFO] module: model.embedding_group.emb_impls.BASE.mc_ebc
param | sharding type | compute kernel | ranks
------------------------- | ------------- | -------------- | -----
uuid_emb_wide | row_wise | fused | [0]
book_title_emb_wide | row_wise | fused | [0]
sid_emb_wide | row_wise | fused | [0]
uuid_emb | row_wise | fused | [0]
book_title_emb | row_wise | fused | [0]
clk_storyid_seq_30d_emb | row_wise | fused | [0]
sid_emb | row_wise | fused | [0]
user_online_time_lang_emb | row_wise | fused | [0]
param | shard offsets | shard sizes | placement
------------------------- | ------------- | ------------- | -------------
uuid_emb_wide | [0, 0] | [2000000, 4] | rank:0/cuda:0
book_title_emb_wide | [0, 0] | [10000000, 4] | rank:0/cuda:0
sid_emb_wide | [0, 0] | [10000000, 4] | rank:0/cuda:0
uuid_emb | [0, 0] | [2000000, 8] | rank:0/cuda:0
book_title_emb | [0, 0] | [10000000, 8] | rank:0/cuda:0
clk_storyid_seq_30d_emb | [0, 0] | [10000000, 8] | rank:0/cuda:0
sid_emb | [0, 0] | [10000000, 8] | rank:0/cuda:0
user_online_time_lang_emb | [0, 0] | [10000000, 8] | rank:0/cuda:0
[2025-05-06 06:46:29,128][WARNING] /opt/conda/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py:863: UserWarning: _get_pg_default_device will be deprecated, it only stays for backward-compatiblity reason. If you need to find a device for object collectives, please use _get_object_coll_device. If you need to query the device types supported by group, please use _device_capability(group).
warnings.warn(
[2025-05-06 06:46:29,895][INFO] Restoring checkpoint from /mnt/workspace/czy/easyrec/experiments/deep_fm/model.ckpt-1599...
[2025-05-06 06:46:29,895][INFO] Restoring model state from /mnt/workspace/czy/easyrec/experiments/deep_fm/model.ckpt-1599/model...
[2025-05-06 06:46:29,902][WARNING] /opt/conda/lib/python3.11/site-packages/torch/distributed/checkpoint/planner_helpers.py:316: FutureWarning: Please use DTensor instead and we are deprecating ShardedTensor.
device = getattr(value, "device", None)
[rank0]: Traceback (most recent call last):
[rank0]: File "", line 198, in _run_module_as_main
[rank0]: File "", line 88, in _run_code
[rank0]: File "/opt/conda/lib/python3.11/site-packages/tzrec/train_eval.py", line 57, in
[rank0]: train_and_evaluate(
[rank0]: File "/opt/conda/lib/python3.11/site-packages/tzrec/main.py", line 667, in train_and_evaluate
[rank0]: _train_and_evaluate(
[rank0]: File "/opt/conda/lib/python3.11/site-packages/tzrec/main.py", line 419, in _train_and_evaluate
[rank0]: checkpoint_util.restore_model(
[rank0]: File "/opt/conda/lib/python3.11/site-packages/tzrec/utils/checkpoint_util.py", line 168, in restore_model
[rank0]: model.load_state_dict(state_dict)
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torchrec/distributed/model_parallel.py", line 471, in load_state_dict
[rank0]: return self._load_state_dict(self, state_dict, prefix, strict)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torchrec/distributed/model_parallel.py", line 495, in _load_state_dict
[rank0]: m_keys, u_keys = self._load_state_dict(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torchrec/distributed/model_parallel.py", line 495, in _load_state_dict
[rank0]: m_keys, u_keys = self._load_state_dict(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torchrec/distributed/model_parallel.py", line 495, in _load_state_dict
[rank0]: m_keys, u_keys = self._load_state_dict(
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^
[rank0]: [Previous line repeated 3 more times]
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torchrec/distributed/model_parallel.py", line 489, in _load_state_dict
[rank0]: return module.load_state_dict(state_dict, strict=strict)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 2561, in load_state_dict
[rank0]: load(self, state_dict)
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 2549, in load
[rank0]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 2549, in load
[rank0]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 2549, in load
[rank0]: load(child, child_state_dict, child_prefix) # noqa: F821
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torch/nn/modules/module.py", line 2554, in load
[rank0]: out = hook(module, incompatible_keys)
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torchrec/modules/mc_modules.py", line 200, in _load_state_dict_post_hook
[rank0]: module.validate_state()
[rank0]: File "/opt/conda/lib/python3.11/site-packages/torchrec/modules/mc_modules.py", line 1368, in validate_state
[rank0]: start in self._output_segments_tensor
[rank0]: AssertionError: shard within range [0, 2000000] cannot be built out of segements tensor([ 0, 10, -1, ..., -1, -1, -1], device='cuda:0')
I20250506 06:48:01.530891 1016492 fg_handler.cc:1157] Destroy FgHandler (1)
I20250506 06:48:01.531891 1016492 fg_handler.cc:1157] Destroy FgHandler (2)
I20250506 06:48:02.180599 1016492 dl_loader.cc:183] DLLoader::~DLLoader() Done
E0506 06:48:02.595000 1016469 site-packages/torch/distributed/elastic/multiprocessing/api.py:869] failed (exitcode: 1) local_rank: 0 (pid: 1016492) of binary: /opt/conda/bin/python3.11
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 8, in
sys.exit(main())
^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 355, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/distributed/run.py", line 918, in main
run(args)
File "/opt/conda/lib/python3.11/site-packages/torch/distributed/run.py", line 909, in run
elastic_launch(
File "/opt/conda/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 138, in call
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/opt/conda/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 269, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
tzrec.train_eval FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2025-05-06_06:48:02
host : dsw-241660-5fd89544d5-7bd9m
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1016492)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
root@dsw-241660-5fd89544d5-7bd9m:/mnt/workspace#