Issues with Model Performance on Smaller Roboflow Datasets #214

Open

@probicheaux
Hey @Peterande! First of all, thank you so much for this interesting contribution.

We at Roboflow are trying to benchmark D-FINE's performance on smaller Roboflow datasets, like those in Roboflow 100, and we are seeing sub-5% mAP as well as crashes on every dataset we have tried, both with and without the O365 pretrained weights.

We're using the finetuning config you have defined here, simply replacing the dataset paths with a Roboflow dataset. We want to make sure we represent your work in the best light, so do you have any advice on how to get better metrics here? We noticed that you recommended training from scratch to the other people encountering similar issues, but I didn't see anyone confirm that it worked for them, and it hasn't helped much for us either.
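Before launching, we also sanity-check the dataset config that the script below generates by loading it back and printing the substituted fields. A minimal sketch, assuming PyYAML is available; the key paths (num_classes at the top level, image/annotation paths under train_dataloader.dataset) are our reading of the custom_detection.yml template, not anything D-FINE documents:

import yaml

# Hypothetical path to one of the dataset configs the script below writes.
config_path = "configs/dataset/roboflow/example_dfine_dataset_config.yml"

with open(config_path) as f:
    cfg = yaml.safe_load(f)

# Confirm the class count and split paths were substituted as intended.
print("num_classes:", cfg["num_classes"])
print("train img_folder:", cfg["train_dataloader"]["dataset"]["img_folder"])
print("train ann_file:", cfg["train_dataloader"]["dataset"]["ann_file"])

If the printed values match the downloaded dataset, an obvious path or class-count mix-up is at least ruled out.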

Here's how we've been launching the benchmark. Please let us know if we're doing something obviously wrong!

import roboflow
import os
import json
import subprocess
import torch
import fire
import re


model_name_to_config_map = {
    "dfine_s": "configs/dfine/custom/objects365/dfine_hgnetv2_s_obj2custom.yml",
    "dfine_m": "configs/dfine/custom/objects365/dfine_hgnetv2_m_obj2custom.yml",
    "dfine_l": "configs/dfine/custom/objects365/dfine_hgnetv2_l_obj2custom.yml",
    "dfine_x": "configs/dfine/custom/objects365/dfine_hgnetv2_x_obj2custom.yml",
}
dataset_config_template = "configs/dataset/custom_detection.yml"

generated_model_config_base_dir = "configs/dfine/custom/roboflow"
generated_dataset_config_base_dir = "configs/dataset/roboflow"

model_name_to_o365_checkpoint_map = {
    "dfine_s": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_s_obj365.pth",
    "dfine_m": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_m_obj365.pth",
    "dfine_l": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_l_obj365.pth",
    "dfine_x": "https://github.com/Peterande/storage/releases/download/dfinev1.0/dfine_x_obj365.pth",
}


def train_on_roboflow_url(roboflow_url, model_name="dfine_s", output_dir="./output"):
    # load dataset and related info
    print(f"Downloading dataset from {roboflow_url}")
    dataset = roboflow.download_dataset(roboflow_url, "coco")
    
    dataset_train_image_folder = os.path.join(dataset.location, "train")
    dataset_train_annotation_file = os.path.join(dataset_train_image_folder, "_annotations.coco.json")

    dataset_val_image_folder = os.path.join(dataset.location, "valid")

    with open(dataset_train_annotation_file, "r") as f:
        train_annotations = json.load(f)

    num_classes = len(train_annotations["categories"])

    del train_annotations

    # construct dataset config
    print(f"Creating dataset config in {output_dir}")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(generated_dataset_config_base_dir, exist_ok=True)
    os.makedirs(generated_model_config_base_dir, exist_ok=True)

    with open(dataset_config_template, "r") as f:
        dataset_config = f.read()

    dataset_config = dataset_config.replace("num_classes: 777 # your dataset classes", f"num_classes: {num_classes}")

    dataset_config = dataset_config.replace("/data/yourdataset/train", dataset_train_image_folder)
    dataset_config = dataset_config.replace("train.json", "_annotations.coco.json")

    dataset_config = dataset_config.replace("/data/yourdataset/val", dataset_val_image_folder)
    dataset_config = dataset_config.replace("val.json", "_annotations.coco.json")

    dataset_filename = f"{dataset.name}_dfine_dataset_config.yml"
    dataset_config_save_name = os.path.join(generated_dataset_config_base_dir, dataset_filename)

    with open(dataset_config_save_name, "w") as f:
        f.write(dataset_config)
    
    # construct model config
    print(f"Creating model config in {output_dir}")
    model_config_path = model_name_to_config_map[model_name]

    with open(model_config_path, "r") as f:
        model_config = f.read()
    
    # Standardize the schedule and batch size across models. Use line-anchored
    # regexes so that matching "epoch:" cannot accidentally hit the "epoch:"
    # substring inside "stop_epoch:" (a plain str.replace on the raw match could).
    model_config = re.sub(r"(?m)^(\s*)epochs: \d+", r"\g<1>epochs: 100", model_config)
    # Inject a fixed total batch size under the train_dataloader key; this
    # assumes the config does not already set total_batch_size there.
    model_config = model_config.replace("train_dataloader:", "train_dataloader:\n  total_batch_size: 16")
    model_config = re.sub(r"(?m)^(\s*)epoch: \d+", r"\g<1>epoch: 90", model_config)
    model_config = re.sub(r"(?m)^(\s*)stop_epoch: \d+", r"\g<1>stop_epoch: 90", model_config)

    model_config = model_config.replace("dataset/custom_detection.yml", os.path.join("..", dataset_config_save_name))

    # model_config = model_config.replace("\'../", "\'../../")  # since the model config is in an extra subdir

    train_output_dir = os.path.join(output_dir, f"{dataset.name}_{model_name}_train_output")
    model_size = model_name.split("_")[1]
    model_config = model_config.replace(f"output_dir: ./output/dfine_hgnetv2_{model_size}_obj2custom", f"output_dir: {train_output_dir}")

    model_config_save_name = os.path.join(generated_model_config_base_dir, f"{dataset.name}_{model_name}_model_config.yml")

    with open(model_config_save_name, "w") as f:
        f.write(model_config)
    
    # train model
    o365_checkpoint_url = model_name_to_o365_checkpoint_map[model_name]
    o365_checkpoint_name = o365_checkpoint_url.split("/")[-1]
    o365_checkpoint_path = os.path.join(output_dir, o365_checkpoint_name)

    if not os.path.exists(o365_checkpoint_path):
        print(f"Downloading O365 checkpoint from {o365_checkpoint_url}")
        # check=True aborts on a failed download instead of continuing with a
        # missing or partial checkpoint file.
        subprocess.run(["wget", o365_checkpoint_url, "-O", o365_checkpoint_path], check=True)

    print(f"Training model in {train_output_dir}")
    num_gpus = torch.cuda.device_count()
    train_result = subprocess.run([
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--rdzv_endpoint", "localhost:0",
        "--rdzv-backend", "c10d",
        "train.py",
        "-c", model_config_save_name,
        "--use-amp",
        "--seed=0",
        "-t", o365_checkpoint_path
    ])

    # Get test-set performance: point the dataset config at the test split.
    # This assumes the substring "valid" only appears in the split paths, not
    # in the dataset's own name.
    dataset_config = dataset_config.replace("valid", "test")

    with open(dataset_config_save_name, "w") as f:
        f.write(dataset_config)
    
    stg1_checkpoint_path = os.path.join(train_output_dir, "best_stg1.pth")
    stg2_checkpoint_path = os.path.join(train_output_dir, "best_stg2.pth")

    if os.path.exists(stg2_checkpoint_path):
        print(f"Testing with STG2 checkpoint {stg2_checkpoint_path}")
        checkpoint_path = stg2_checkpoint_path
    elif os.path.exists(stg1_checkpoint_path):
        print(f"Testing with STG1 checkpoint {stg1_checkpoint_path}")
        checkpoint_path = stg1_checkpoint_path
    else:
        raise ValueError(f"No checkpoint found in {train_output_dir}")
    
    test_result = subprocess.run([
        "torchrun",
        "--nproc_per_node", str(num_gpus),
        "--rdzv-endpoint", "localhost:0",
        "--rdzv-backend", "c10d",
        "train.py",
        "-c", model_config_save_name,
        "--test-only",
        "-r", checkpoint_path
    ])
    if test_result.returncode != 0:
        raise RuntimeError(f"Evaluation exited with code {test_result.returncode}")

    test_stats_pth = os.path.join(train_output_dir, "test_stats.pth")
    test_stats = torch.load(test_stats_pth, weights_only=False)

    # coco_eval_bbox follows the COCOeval stats layout: index 0 is
    # AP@[.50:.95] and index 1 is AP@.50.
    results_json = {
        "model_name": model_name,
        "map": test_stats["coco_eval_bbox"][0],
        "map50": test_stats["coco_eval_bbox"][1],
        "url": roboflow_url,
    }

    results_json_pth = os.path.join(train_output_dir, "results.json")
    with open(results_json_pth, "w") as f:
        json.dump(results_json, f, indent=2)
    
    
if __name__ == "__main__":
    fire.Fire(train_on_roboflow_url)
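
For completeness, since the script exposes train_on_roboflow_url through fire.Fire, we invoke it like this (benchmark_dfine.py is just our local name for the file above):

python benchmark_dfine.py <roboflow_dataset_url> --model_name dfine_s --output_dir ./output

and repeat per model size and per Roboflow 100 dataset, collecting each run's results.json.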
