Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e_test.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: ete_test

run-name: 'e2e_test-${{ github.event.inputs.run_case }}'
permissions:
contents: write
pages: write
Expand Down
159 changes: 159 additions & 0 deletions autotest/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,165 @@ case:
lr: 0
timeout: 10800

qwen3-5-sft-fp8-vl:
-
type: sft
parameters:
config: autotest/config/qwen3_5_fp8_vl.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
cpus_per_task: 80
envs:
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- XTUNER_DETERMINISTIC=true
- XTUNER_ACTIVATION_OFFLOAD=1
- XTUNER_USE_FA3=1
assert_info:
base_metric: qwen3-5-sft-fp8-vl/00f7e16/tracker.jsonl
check_metrics:
grad_norm: 10
loss/local_loss: 5
loss/reduced_balancing_loss: 5
loss/reduced_llm_loss: 5
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
timeout: 10800

qwen3-5-sft-tp2-vl:
-
type: sft
parameters:
config: autotest/config/qwen3_5_tp2_vl.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
cpus_per_task: 80
envs:
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- XTUNER_DETERMINISTIC=true
assert_info:
base_metric: qwen3-5-sft-tp2-vl/00f7e16/tracker.jsonl
check_metrics:
grad_norm: 10
loss/local_loss: 5
loss/reduced_balancing_loss: 5
loss/reduced_llm_loss: 5
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
timeout: 10800

qwen3-5-sft-sp4-resume-vl:
-
type: sft
parameters:
config: autotest/config/qwen3_5_moe_30BA3_sp4_vl.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
envs:
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- XTUNER_DETERMINISTIC=true
- XTUNER_GC_ENABLE=1
assert_info:
base_metric: qwen3-5-sft-sp4-resume-vl/00f7e16/tracker.jsonl
check_metrics:
grad_norm: 10
loss/local_loss: 5
loss/reduced_balancing_loss: 5
loss/reduced_llm_loss: 10
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
timeout: 10800

-
type: sft
pre_action:
command: 'python ./autotest/utils/update_meta.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume-vl sft'
parameters:
config: autotest/config/qwen3_5_moe_30BA3_sp4_vl.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
memory_per_task: 1200
envs:
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- XTUNER_DETERMINISTIC=true
- XTUNER_GC_ENABLE=1
assert_info:
base_metric: qwen3-5-sft-sp4-resume-vl/00f7e16_resume/tracker.jsonl
check_metrics:
grad_norm: 10
loss/local_loss: 5
loss/reduced_balancing_loss: 5
loss/reduced_llm_loss: 5
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
post_action:
command: 'python ./autotest/utils/resume_validation.py /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output qwen3-5-sft-sp4-resume-vl sft 10 "runtime_info/text_tokens,runtime_info/efficient_attn_ratio,loss/reduced_balancing_loss,loss/reduced_llm_loss,loss/local_loss"'
timeout: 10800

qwen3-5-sft-mtp:
-
type: sft
parameters:
config: autotest/config/qwen3_5_mtp.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
envs:
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/alpaca
- XTUNER_DETERMINISTIC=true
assert_info:
base_metric: qwen3-5-sft-mtp/00f7e16/tracker.jsonl
check_metrics:
grad_norm: 0.05
loss/local_loss: 0.000001
loss/reduced_balancing_loss: 0.000001
loss/reduced_llm_loss: 0.000001
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
timeout: 10800

qwen3-5-sft-mtp-vl:
-
type: sft
parameters:
config: autotest/config/qwen3_5_mtp_vl.py
output_path: /mnt/shared-storage-user/llmrazor-share/qa-llm-cicd/test_output
resource:
envs:
- MODEL_PATH=/mnt/shared-storage-user/llmrazor-share/model/Qwen3.5-35B-A3B
- DATA_PATH=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- MEDIA_ROOT=/mnt/shared-storage-user/llmrazor-share/data/ci_vl
- XTUNER_DETERMINISTIC=true
assert_info:
base_metric: qwen3-5-sft-mtp-vl/00f7e16/tracker.jsonl
check_metrics:
grad_norm: 0.000001
loss/local_loss: 0.000001
loss/reduced_balancing_loss: 0.000001
loss/reduced_llm_loss: 0.000001
lr: 0
memory/max_memory_GB: 0.2
runtime_info/tgs: 0.05
runtime_info/text_tokens: 0
timeout: 10800

qwen3-rl-lmdeploy:
-
type: rl
Expand Down
67 changes: 67 additions & 0 deletions autotest/config/qwen3_5_fp8_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os

from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.float8.config import Float8Config, ScalingGranularity
from xtuner.v1.loss.ce_loss import CELossConfig
from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
from xtuner.v1.train import TrainerConfig


# Paths are injected by the CI harness via environment variables (see the
# "envs" list for this case in autotest/config.yaml); a KeyError here means
# the harness did not export one of them.
MEDIA_ROOT = os.environ["MEDIA_ROOT"]
MODEL_PATH = os.environ["MODEL_PATH"]
DATA_PATH = os.environ["DATA_PATH"]

# Tile-wise FP8 scaling for both the dense GEMMs and the grouped (MoE expert)
# GEMMs.
float8_cfg = Float8Config(
    scaling_granularity_gemm=ScalingGranularity.TILEWISE,
    scaling_granularity_grouped_gemm=ScalingGranularity.TILEWISE,
)

# torch.compile is disabled so runs stay comparable across commits.
moe_cfg = Qwen3_5_VLMoE35BA3Config(float8_cfg=float8_cfg, compile_cfg=False)

optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(cpu_offload=False)

# Single VL SFT dataset; tokenizer/processor are loaded from the model path.
_vl_dataset = DatasetConfig(
    name="sft",
    anno_path=DATA_PATH,
    class_name="VLMJsonlDataset",
    media_root=MEDIA_ROOT,
    sample_ratio=1.0,
)
_vl_tokenize_fn = Qwen3VLTokenizeFnConfig(
    processor_path=MODEL_PATH,
    max_length=16384,
    add_vision_id=True,
)
dataset_config = [{"dataset": _vl_dataset, "tokenize_fn": _vl_tokenize_fn}]

dataloader_config = DataloaderConfig(
    dataset_config_list=dataset_config,
    pack_max_length=16384,
    collator="qwen3_vl_sft_collator",
)

loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)

# ``trainer`` is the object the autotest harness imports from this module.
trainer = TrainerConfig(
    load_from=MODEL_PATH,
    tokenizer_path=MODEL_PATH,
    model_cfg=moe_cfg,
    optim_cfg=optim_cfg,
    lr_cfg=lr_cfg,
    fsdp_cfg=fsdp_cfg,
    dataloader_cfg=dataloader_config,
    loss_cfg=loss_cfg,
    global_batch_size=16,
    total_epoch=1,
    work_dir=os.environ["WORK_DIR"],
    seed=0,
)
66 changes: 66 additions & 0 deletions autotest/config/qwen3_5_moe_30BA3_sp4_vl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import os

from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
from xtuner.v1.datasets import Qwen3VLTokenizeFnConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.loss.ce_loss import CELossConfig
from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
from xtuner.v1.train import ResumeConfig, TrainerConfig


# Paths are injected by the CI harness via environment variables (see the
# "envs" list for this case in autotest/config.yaml); a KeyError here means
# the harness did not export one of them.
MEDIA_ROOT = os.environ["MEDIA_ROOT"]
MODEL_PATH = os.environ["MODEL_PATH"]
DATA_PATH = os.environ["DATA_PATH"]


# torch.compile is disabled so runs stay comparable across commits.
moe_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)

optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
# NOTE(review): the file/case name says "sp4", but this only sets tp_size=2
# and no sequence-parallel option is visible here — confirm the name still
# matches the intended parallelism.
fsdp_cfg = FSDPConfig(cpu_offload=False, tp_size=2)

# Single VL SFT dataset; tokenizer/processor are loaded from the model path.
_vl_dataset = DatasetConfig(
    name="sft",
    anno_path=DATA_PATH,
    class_name="VLMJsonlDataset",
    media_root=MEDIA_ROOT,
    sample_ratio=1.0,
)
_vl_tokenize_fn = Qwen3VLTokenizeFnConfig(
    processor_path=MODEL_PATH,
    max_length=16384,
    add_vision_id=True,
)
dataset_config = [{"dataset": _vl_dataset, "tokenize_fn": _vl_tokenize_fn}]

dataloader_config = DataloaderConfig(
    dataset_config_list=dataset_config,
    pack_max_length=16384,
    collator="qwen3_vl_sft_collator",
)

loss_cfg = CELossConfig(mode="chunk", chunk_size=1024)

# ``trainer`` is the object the autotest harness imports from this module.
# Checkpointing every 10 steps (keeping the last 2) plus auto_resume is what
# the resume-validation post_action in autotest/config.yaml relies on.
trainer = TrainerConfig(
    load_from=MODEL_PATH,
    tokenizer_path=MODEL_PATH,
    model_cfg=moe_cfg,
    optim_cfg=optim_cfg,
    lr_cfg=lr_cfg,
    fsdp_cfg=fsdp_cfg,
    dataloader_cfg=dataloader_config,
    loss_cfg=loss_cfg,
    global_batch_size=16,
    total_epoch=1,
    work_dir=os.environ["WORK_DIR"],
    seed=0,
    resume_cfg=ResumeConfig(auto_resume=True),
    checkpoint_interval=10,
    checkpoint_maxkeep=2,
)
50 changes: 50 additions & 0 deletions autotest/config/qwen3_5_mtp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os

from xtuner.v1.config import AdamWConfig, FSDPConfig, LRConfig
from xtuner.v1.datasets.config import DataloaderConfig, DatasetConfig
from xtuner.v1.datasets.sft_tokenize_fn import OpenaiTokenizeFunctionConfig
from xtuner.v1.loss.ce_loss import CELossConfig
from xtuner.v1.model import Qwen3_5_VLMoE35BA3Config
from xtuner.v1.module.mtp import MTPConfig
from xtuner.v1.train import TrainerConfig


# Paths are injected by the CI harness via environment variables (see the
# "envs" list for this case in autotest/config.yaml); a KeyError here means
# the harness did not export one of them.
MODEL_PATH = os.environ["MODEL_PATH"]
DATA_PATH = os.environ["DATA_PATH"]


# Text-only MTP run: attach a 1-layer multi-token-prediction head to the text
# config, with its loss scaled to 0.1 of the main LM loss.
# torch.compile is disabled so runs stay comparable across commits.
moe_cfg = Qwen3_5_VLMoE35BA3Config(compile_cfg=False)
moe_cfg.text_config.mtp_config = MTPConfig(num_layers=1, loss_scaling_factor=0.1)

optim_cfg = AdamWConfig(lr=6e-05)
lr_cfg = LRConfig(lr_type="cosine", lr_min=1e-6)
fsdp_cfg = FSDPConfig(cpu_offload=False)

# Plain alpaca SFT data tokenized with the qwen3 chat template.
_alpaca_dataset = DatasetConfig(name="alpaca", anno_path=DATA_PATH, sample_ratio=1.0)
_alpaca_tokenize_fn = OpenaiTokenizeFunctionConfig(chat_template="qwen3", max_length=16384)
dataset_config = [{"dataset": _alpaca_dataset, "tokenize_fn": _alpaca_tokenize_fn}]

dataloader_config = DataloaderConfig(pack_max_length=16384)

loss_cfg = CELossConfig(mode="chunk", chunk_size=1024, loss_reduction="square")

# ``trainer`` is the object the autotest harness imports from this module.
# NOTE(review): unlike the VL configs in this change, the dataset list is
# passed to TrainerConfig via ``dataset_cfg`` rather than through
# DataloaderConfig's ``dataset_config_list`` — presumably both paths are
# supported; confirm against TrainerConfig.
trainer = TrainerConfig(
    load_from=MODEL_PATH,
    tokenizer_path=MODEL_PATH,
    model_cfg=moe_cfg,
    optim_cfg=optim_cfg,
    lr_cfg=lr_cfg,
    fsdp_cfg=fsdp_cfg,
    dataset_cfg=dataset_config,
    dataloader_cfg=dataloader_config,
    loss_cfg=loss_cfg,
    global_batch_size=16,
    total_epoch=1,
    work_dir=os.environ["WORK_DIR"],
    seed=0,
)
Loading
Loading