From 5308424705646e701da3ccae024bc05b76845a4b Mon Sep 17 00:00:00 2001
From: Kingsley
Date: Thu, 5 Jun 2025 13:22:01 +0800
Subject: [PATCH] [script] add Script description for qwen_omni_merge (#8293)

---
 scripts/qwen_omni_merge.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/qwen_omni_merge.py b/scripts/qwen_omni_merge.py
index 695f73b3..e7722e38 100644
--- a/scripts/qwen_omni_merge.py
+++ b/scripts/qwen_omni_merge.py
@@ -12,6 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""Why do we need this script for qwen_omni?
+
+The qwen_omni model consists of two parts:
+1. [Thinker]: [audio_encoder, vision_encoder, LLM backbone], which our repository supports for post-training.
+2. [Talker]: [audio_decoder, wave_model], which cannot be post-trained without a dedicated tokenizer.
+During post-training, only the [Thinker] part is trained, and the [Talker] part is dropped.
+So, to obtain the complete model, we need to merge the [Talker] part back into the [Thinker] part.
+LoRA mode: [Thinker + LoRA weights] + [Original Talker] -> [Omni model]
+Full mode: [Thinker] + [Original Talker] -> [Omni model]
+For the processor, we save the processor from the trained model instead of the original model.
+"""
+
 import os
 import shutil
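
For readers unfamiliar with the merge flow the docstring describes, here is a minimal sketch of the LoRA-mode merge under stated assumptions: it assumes the Qwen2_5OmniForConditionalGeneration class from transformers with .thinker/.talker submodules, peft for adapter merging, and illustrative paths (saves/lora_adapter, saves/omni_merged). It is not the script's actual implementation; see scripts/qwen_omni_merge.py for that.

# A hypothetical sketch of the LoRA-mode merge described in the docstring;
# class names, attributes, and paths below are assumptions for illustration,
# not the actual implementation in scripts/qwen_omni_merge.py.
from peft import PeftModel
from transformers import AutoProcessor, Qwen2_5OmniForConditionalGeneration

# Load the original Omni model, which still contains the untouched Talker.
omni = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# Graft the trained LoRA adapter onto the Thinker and fold the weights in.
# "saves/lora_adapter" is an assumed output directory from post-training.
thinker = PeftModel.from_pretrained(omni.thinker, "saves/lora_adapter")
omni.thinker = thinker.merge_and_unload()

# Save the merged Omni model, and save the processor from the trained run
# rather than from the original model, as the docstring notes (this assumes
# the training run saved processor files alongside the adapter).
omni.save_pretrained("saves/omni_merged")
processor = AutoProcessor.from_pretrained("saves/lora_adapter")
processor.save_pretrained("saves/omni_merged")

In full mode, the adapter-merge step would be skipped: the trained Thinker weights are loaded directly and combined with the original Talker before saving.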