[webui] Fix WebUI training hang from subprocess log pipe (#10584)

Co-authored-by: 凉夜 <liangye@liangyedeMacBook-Air.local>
This commit is contained in:
summernight
2026-06-17 15:36:40 +08:00
committed by GitHub
parent 8669a22e9c
commit 8792f06161

View File

@@ -16,7 +16,7 @@ import json
import os
from collections.abc import Generator
from copy import deepcopy
from subprocess import PIPE, Popen, TimeoutExpired
from subprocess import Popen, TimeoutExpired
from typing import TYPE_CHECKING, Any
from transformers.utils import is_torch_npu_available
@@ -375,7 +375,16 @@ class Runner:
env["FORCE_TORCHRUN"] = "1"
# NOTE: DO NOT USE shell=True to avoid security risk
self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env, stderr=PIPE, text=True)
webui_log_path = os.path.join(args["output_dir"], "webui_subprocess.log")
webui_log = open(webui_log_path, "a", encoding="utf-8")
self.trainer = Popen(
["llamafactory-cli", "train", save_cmd(args)],
env=env,
stdout=webui_log,
stderr=webui_log,
text=True,
)
webui_log.close()
yield from self.monitor()
def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
@@ -451,6 +460,16 @@ class Runner:
else:
finish_log = load_eval_results(os.path.join(output_path, "all_results.json")) + "\n\n" + running_log
else:
if stderr is None:
webui_log_path = os.path.join(output_path, "webui_subprocess.log")
if os.path.exists(webui_log_path):
with open(webui_log_path, "rb") as f:
f.seek(0, os.SEEK_END)
f.seek(max(f.tell() - 20000, 0))
stderr = f.read().decode("utf-8", errors="replace")
else:
stderr = "No subprocess log file found."
print(stderr)
finish_info = ALERTS["err_failed"][lang]
finish_log = ALERTS["err_failed"][lang] + f" Exit code: {return_code}\n\n```\n{stderr}\n```\n"