[webui] Fix WebUI training hang from subprocess log pipe (#10584)

Co-authored-by: 凉夜 <liangye@liangyedeMacBook-Air.local>
This commit is contained in:
summernight
2026-06-17 15:36:40 +08:00
committed by GitHub
parent 8669a22e9c
commit 8792f06161

View File

@@ -16,7 +16,7 @@ import json
import os import os
from collections.abc import Generator from collections.abc import Generator
from copy import deepcopy from copy import deepcopy
from subprocess import PIPE, Popen, TimeoutExpired from subprocess import Popen, TimeoutExpired
from typing import TYPE_CHECKING, Any from typing import TYPE_CHECKING, Any
from transformers.utils import is_torch_npu_available from transformers.utils import is_torch_npu_available
@@ -375,7 +375,16 @@ class Runner:
env["FORCE_TORCHRUN"] = "1" env["FORCE_TORCHRUN"] = "1"
# NOTE: DO NOT USE shell=True to avoid security risk # NOTE: DO NOT USE shell=True to avoid security risk
self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env, stderr=PIPE, text=True) webui_log_path = os.path.join(args["output_dir"], "webui_subprocess.log")
webui_log = open(webui_log_path, "a", encoding="utf-8")
self.trainer = Popen(
["llamafactory-cli", "train", save_cmd(args)],
env=env,
stdout=webui_log,
stderr=webui_log,
text=True,
)
webui_log.close()
yield from self.monitor() yield from self.monitor()
def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]: def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
@@ -451,6 +460,16 @@ class Runner:
else: else:
finish_log = load_eval_results(os.path.join(output_path, "all_results.json")) + "\n\n" + running_log finish_log = load_eval_results(os.path.join(output_path, "all_results.json")) + "\n\n" + running_log
else: else:
if stderr is None:
webui_log_path = os.path.join(output_path, "webui_subprocess.log")
if os.path.exists(webui_log_path):
with open(webui_log_path, "rb") as f:
f.seek(0, os.SEEK_END)
f.seek(max(f.tell() - 20000, 0))
stderr = f.read().decode("utf-8", errors="replace")
else:
stderr = "No subprocess log file found."
print(stderr) print(stderr)
finish_info = ALERTS["err_failed"][lang] finish_info = ALERTS["err_failed"][lang]
finish_log = ALERTS["err_failed"][lang] + f" Exit code: {return_code}\n\n```\n{stderr}\n```\n" finish_log = ALERTS["err_failed"][lang] + f" Exit code: {return_code}\n\n```\n{stderr}\n```\n"