mirror of
https://github.com/hiyouga/LLaMA-Factory.git
synced 2026-06-25 08:38:55 +08:00
[webui] Fix WebUI training hang from subprocess log pipe (#10584)
Co-authored-by: 凉夜 <liangye@liangyedeMacBook-Air.local>
This commit is contained in:
@@ -16,7 +16,7 @@ import json
|
|||||||
import os
|
import os
|
||||||
from collections.abc import Generator
|
from collections.abc import Generator
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from subprocess import PIPE, Popen, TimeoutExpired
|
from subprocess import Popen, TimeoutExpired
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from transformers.utils import is_torch_npu_available
|
from transformers.utils import is_torch_npu_available
|
||||||
@@ -375,7 +375,16 @@ class Runner:
|
|||||||
env["FORCE_TORCHRUN"] = "1"
|
env["FORCE_TORCHRUN"] = "1"
|
||||||
|
|
||||||
# NOTE: DO NOT USE shell=True to avoid security risk
|
# NOTE: DO NOT USE shell=True to avoid security risk
|
||||||
self.trainer = Popen(["llamafactory-cli", "train", save_cmd(args)], env=env, stderr=PIPE, text=True)
|
webui_log_path = os.path.join(args["output_dir"], "webui_subprocess.log")
|
||||||
|
webui_log = open(webui_log_path, "a", encoding="utf-8")
|
||||||
|
self.trainer = Popen(
|
||||||
|
["llamafactory-cli", "train", save_cmd(args)],
|
||||||
|
env=env,
|
||||||
|
stdout=webui_log,
|
||||||
|
stderr=webui_log,
|
||||||
|
text=True,
|
||||||
|
)
|
||||||
|
webui_log.close()
|
||||||
yield from self.monitor()
|
yield from self.monitor()
|
||||||
|
|
||||||
def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
|
def _build_config_dict(self, data: dict["Component", Any]) -> dict[str, Any]:
|
||||||
@@ -451,6 +460,16 @@ class Runner:
|
|||||||
else:
|
else:
|
||||||
finish_log = load_eval_results(os.path.join(output_path, "all_results.json")) + "\n\n" + running_log
|
finish_log = load_eval_results(os.path.join(output_path, "all_results.json")) + "\n\n" + running_log
|
||||||
else:
|
else:
|
||||||
|
if stderr is None:
|
||||||
|
webui_log_path = os.path.join(output_path, "webui_subprocess.log")
|
||||||
|
if os.path.exists(webui_log_path):
|
||||||
|
with open(webui_log_path, "rb") as f:
|
||||||
|
f.seek(0, os.SEEK_END)
|
||||||
|
f.seek(max(f.tell() - 20000, 0))
|
||||||
|
stderr = f.read().decode("utf-8", errors="replace")
|
||||||
|
else:
|
||||||
|
stderr = "No subprocess log file found."
|
||||||
|
|
||||||
print(stderr)
|
print(stderr)
|
||||||
finish_info = ALERTS["err_failed"][lang]
|
finish_info = ALERTS["err_failed"][lang]
|
||||||
finish_log = ALERTS["err_failed"][lang] + f" Exit code: {return_code}\n\n```\n{stderr}\n```\n"
|
finish_log = ALERTS["err_failed"][lang] + f" Exit code: {return_code}\n\n```\n{stderr}\n```\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user