Fine-Tuning a Model on Video Transcoding Logs (1)

1. Objective

Collect the logs produced while video transcoding runs and analyze them to judge whether they indicate risk: low-risk issues can be handled automatically, while high-risk issues raise a warning.

2. Collecting the Dataset

Sample dataset entry:

  {
    "instruction": "Analyze the ffmpeg transcoding log and extract the transcoding status, PSNR value, any error messages, and their resolution.",
    "input": "ffmpeg version N-random-g73a871f3b25 shared. Build: gcc-latest.\nInput #0, mov,mp4, from 'input_video_63.mp4':\n  Duration: 00:00:09.00, start: 0.000000, bitrate: 1000 kb/s\n    Stream #0:0: Video: h264 (Main), yuv420p, 1920x1080, 26 fps\n    Stream #0:1: Audio: aac, 48000 Hz, stereo\n[libx264 @ 0x8a927a4d530184e9] PSNR Y:42.50 U:42.63 V:40.40 Avg:42.00 Global:31.90\nOutput #0, mp4, to 'output_video_93.mp4':\n  Stream #0:0: Video: h264 (H.264 Main)\n  Stream #0:1: Audio: aac\nframe=  234 fps= 26 q=29.0 size=     22527kB time=00:00:09.00 bitrate=1956kbits/s speed=2.5x\nvideo:20274kB audio:2252kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000%\n",
    "output": "{\n  \"successful\": true,\n  \"psnr_value\": 31.90,\n  \"error_message\": \"\",\n  \"resolution_steps\": \"Output file generated as expected. No further action needed.\"\n}"
  },
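Note that the psnr_value in the label (31.90) is the Global PSNR reported by libx264 ("Global:31.90" in the log line), not the per-frame average ("Avg:42.00").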

Cleaning the dataset:

import json

# Load the raw Alpaca-format data and convert it to ChatML
def convert_alpaca_to_chatml(alpaca_data_path, output_path):
    with open(alpaca_data_path, "r", encoding="utf-8") as f:
        alpaca_data = json.load(f)
    
    chatml_data = []
    for item in alpaca_data:
        # Build the prompt: a fixed instruction followed by the raw log
        user_input = f"Analyze the following FFmpeg transcoding log to determine the transcoding outcome, PSNR, any error messages, and resolution steps\n{item['input']}".strip()
        # Store as a ChatML-style conversation
        chatml_data.append({
            "conversations": [
                {"role": "user", "content": user_input},
                {"role": "assistant", "content": item["output"]}
            ]
        })
    
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(chatml_data, f, ensure_ascii=False, indent=2)

# Run the conversion
convert_alpaca_to_chatml("alpaca_data.json", "chatml_data.json")
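Since each assistant turn must itself be valid JSON at inference time, it is worth verifying that every label parses before training. A minimal check (my own addition, not part of the original pipeline):

import json

# Confirm that every assistant reply in the converted file is parseable JSON
with open("chatml_data.json", "r", encoding="utf-8") as f:
    records = json.load(f)

for i, record in enumerate(records):
    reply = record["conversations"][1]["content"]
    try:
        json.loads(reply)  # raises a ValueError if a label is malformed
    except ValueError as e:
        print(f"Record {i} has an invalid JSON label: {e}")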

The processed data looks like this; with it in hand we can move on to fine-tuning:

  {
    "conversations": [
      {
        "role": "user",
        "content": "Analyze the following FFmpeg transcoding log to determine the transcoding outcome, PSNR, any error messages, and resolution steps\nffmpeg version N-random-g73a871f3b25 shared. Build: gcc-latest.\nInput #0, mov,mp4, from 'input_video_63.mp4':\n  Duration: 00:00:09.00, start: 0.000000, bitrate: 1000 kb/s\n    Stream #0:0: Video: h264 (Main), yuv420p, 1920x1080, 26 fps\n    Stream #0:1: Audio: aac, 48000 Hz, stereo\n[libx264 @ 0x8a927a4d530184e9] PSNR Y:42.50 U:42.63 V:40.40 Avg:42.00 Global:31.90\nOutput #0, mp4, to 'output_video_93.mp4':\n  Stream #0:0: Video: h264 (H.264 Main)\n  Stream #0:1: Audio: aac\nframe=  234 fps= 26 q=29.0 size=     22527kB time=00:00:09.00 bitrate=1956kbits/s speed=2.5x\nvideo:20274kB audio:2252kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000%"
      },
      {
        "role": "assistant",
        "content": "{\n  \"successful\": true,\n  \"psnr_value\": 31.90,\n  \"error_message\": \"\",\n  \"resolution_steps\": \"Output file generated as expected. No further action needed.\"\n}"
      }
    ]
  },

3. Fine-Tuning

3.1 Load the Base Model

import os  # file and directory operations
from unsloth import FastLanguageModel  # Unsloth's fast model-loading interface
import torch  # PyTorch, which runs the deep learning model

max_seq_length = 2048  # maximum sequence length: the input token limit, currently 2k
dtype = None  # numeric precision; None lets Unsloth pick a suitable dtype automatically
load_in_4bit = True  # load the model with 4-bit quantization

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 08-31 20:50:34 [__init__.py:241] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.10: Fast Llama patching. Transformers: 4.56.0. vLLM: 0.10.1.1.
   \\   /|    NVIDIA GeForce RTX 4060 Ti. Num GPUs = 1. Max memory: 15.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

I'm running this on a desktop with an RTX 4060 Ti (16 GB); CUDA is 12.6 and PyTorch is 2.7.1. Environment setup is covered in a separate article.
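Before loading anything, a quick sanity check (my own addition) confirms the stack matches the Unsloth banner above:

import torch

print(torch.__version__)              # expect 2.7.1+cu126
print(torch.version.cuda)             # expect 12.6
print(torch.cuda.is_available())      # expect True
print(torch.cuda.get_device_name(0))  # expect NVIDIA GeForce RTX 4060 Ti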

3.2 Set Up the LoRA Adapter

model = FastLanguageModel.get_peft_model(
    model,  # the base model to wrap
    r = 16,  # LoRA rank: dimension of the low-rank matrices; larger means more
             # expressive power but also more trainable parameters
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],  # modules that receive LoRA adapters
    lora_alpha = 16,  # scaling factor (LoRA alpha) applied to the LoRA output:
                      # effective weight = alpha / r * LoRA weight;
                      # usually set equal to r or a bit larger (e.g. 16, 32)
    lora_dropout = 0,  # LoRA dropout: drops connections during training to fight
                       # overfitting; the smaller the dataset, the more dropout helps
    bias = "none",
    use_gradient_checkpointing = "unsloth",  # gradient checkpointing to cut VRAM usage;
                                             # "unsloth" uses Unsloth's optimized variant,
                                             # well suited to large-model fine-tuning
    random_state = 3407,  # random seed
    use_rslora = False,  # whether to use rank-stabilized LoRA (more stable, slightly slower)
    loftq_config = None,  # LoftQ (LoRA + quantization) configuration;
                          # keep None when not using QLoRA-style quantized training
)
Unsloth 2025.8.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
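The object returned by get_peft_model follows the standard PEFT interface, so a quick check (my own addition) shows how small the trainable fraction is compared with the frozen 8B base:

# Only the LoRA adapter weights are trainable; the quantized base stays frozen.
# Prints trainable vs. total parameter counts and the trainable percentage.
model.print_trainable_parameters()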

3.3 Load the Dataset

from datasets import load_dataset
dataset = load_dataset('json', data_files='../data-cleaning/chatml_data.json', split='train')

# Prompt template used to render each training example
chat_template = """Analyze the FFmpeg video transcoding log below and provide the transcoding status, PSNR value, any detected error message, and suggested resolution steps.

### instruction:
{INPUT}

### output:
{OUTPUT}"""

from unsloth import apply_chat_template
# Render every conversation into a single training string
dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template,
)
print(dataset[0])
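As I understand unsloth's apply_chat_template, {INPUT} is filled from the user turn and {OUTPUT} from the assistant turn of each conversation, and the rendered string is stored in a new "text" column; that column is what the trainer below consumes via dataset_text_field = "text".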

3.4 Configure Training Arguments

# SFTTrainer from trl handles supervised fine-tuning (SFT)
from trl import SFTTrainer

# TrainingArguments from transformers holds the training configuration
from transformers import TrainingArguments

# Unsloth helper that reports whether the current GPU supports bf16
from unsloth import is_bfloat16_supported

# Build the SFTTrainer that runs the fine-tune
trainer = SFTTrainer(
    model = model,                  # the model to fine-tune
    tokenizer = tokenizer,          # matching tokenizer, converts text to tokens
    train_dataset = dataset,        # the training dataset
    dataset_text_field = "text",    # name of the dataset column that holds the text
    max_seq_length = max_seq_length,# maximum input sequence length
    dataset_num_proc = 2,           # number of preprocessing worker processes
    packing = False,                # sequence packing (saves memory); disabled here
    args = TrainingArguments(       # training configuration
        per_device_train_batch_size = 2,  # training batch size per device (GPU/TPU)
        gradient_accumulation_steps = 4,  # accumulate gradients to simulate a larger batch
        warmup_steps = 5,                 # learning-rate warmup steps
        max_steps = 60,                   # total number of training steps
        learning_rate = 2e-4,             # initial learning rate
        fp16 = not is_bfloat16_supported(),  # fall back to fp16 when bf16 is unsupported
        bf16 = is_bfloat16_supported(),      # prefer bf16 where the hardware supports it
        logging_steps = 1,                # log every step
        optim = "adamw_8bit",             # 8-bit AdamW optimizer, saves VRAM
        weight_decay = 0.01,              # weight-decay coefficient
        lr_scheduler_type = "linear",     # linear learning-rate decay
        seed = 3407,                      # random seed for reproducibility
        output_dir = "outputs",           # output directory
        report_to = "none",               # no external logging backend
    ),
)
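With per_device_train_batch_size = 2 and gradient_accumulation_steps = 4, the effective batch size is 2 × 4 = 8 examples per optimizer step, so the 60 steps below touch roughly 480 training examples.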

3.5 Start Training

trainer_stats = trainer.train()
 [60/60 05:56, Epoch 0/1]
Step	Training Loss	entropy
1	1.443200	0
2	1.383600	No Log
3	1.815600	No Log
4	1.432700	No Log
5	0.914500	No Log
6	1.646100	No Log
7	1.166800	No Log
8	0.972200	No Log
9	1.193800	No Log
10	0.923500	No Log
11	0.933300	No Log
12	0.885000	No Log
13	0.673500	No Log
14	0.646800	No Log
15	0.661600	No Log
16	0.577800	No Log
17	0.617900	No Log
18	0.639100	No Log
19	0.513300	No Log
20	0.663100	No Log
21	0.522300	No Log
22	0.505600	No Log
23	0.693100	No Log
24	0.573300	No Log
25	0.577200	No Log
26	0.446100	No Log
27	0.679900	No Log
28	0.653400	No Log
29	0.596300	No Log
30	0.524200	No Log
31	0.570900	No Log
32	0.500600	No Log
33	0.515300	No Log
34	0.502300	No Log
35	0.501500	No Log
36	0.538300	No Log
37	0.495400	No Log
38	0.682400	No Log
39	0.468300	No Log
40	0.514700	No Log
41	0.551300	No Log
42	0.600800	No Log
43	0.528800	No Log
44	0.477600	No Log
45	0.494400	No Log
46	0.628500	No Log
47	0.385600	No Log
48	0.506600	No Log
49	0.412600	No Log
50	0.542200	No Log
51	0.504200	No Log
52	0.553600	No Log
53	0.544600	No Log
54	0.508500	No Log
55	0.461000	No Log
56	0.434000	No Log
57	0.678700	No Log
58	0.366900	No Log
59	0.486300	No Log
60	0.515300	No Log
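The training loss falls from around 1.4 in the first few steps to roughly 0.5 by step 60, so even this short run is enough for the model to pick up the log-analysis output format ("No Log" simply means no entropy metric was recorded for that step).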

3.6 Test: Successful Transcode

FastLanguageModel.for_inference(model)  # switch the model into Unsloth's fast inference mode
messages = [                   
    {"role": "user", "content": "Analyze the FFmpeg video transcoding log below and provide the transcoding status, PSNR value, any detected error message, and suggested resolution steps.ffmpeg -i input.mp4 -c:v libx264 -crf 23 -f null -\nffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 FFmpeg developers\nInput #0, mov,mp4,m4a,3gp,3g2,mj2, from 'input.mp4':\n  Duration: 00:00:10.00, start: 0.000000, bitrate: 1000 kb/s\n    Stream #0:0: Video: h264 (High), yuv420p, 1280x720, 25 fps\nOutput #0, null, to 'pipe:1':\n  Metadata:\n    encoder         : Lavf58.76.100\n    Stream #0:0: Video: h264 (libx264), yuv420p, 1280x720, q=-1--1, 25 fps\nStream mapping:\n  Stream #0:0 -> #0:0 (h264 (native) -> libx264 (libx264))\nPress [q] to stop, [?] for help.\nframe=  250 fps= 25 q=28.0 size=N/A time=00:00:10.00 bitrate=N/A speed=1.00x\nvideo:0kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.000000%"},
]
# Apply the chat template and move the token ids to the GPU
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
# Stream generated tokens to stdout as they are produced, skipping the prompt
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)
{"successful": true, "psnr": 48.35, "error_message": "", "resolution_steps": ""}<|eot_id|>

3.7 Test: Failed Transcode

FastLanguageModel.for_inference(model) 
messages = [                   
    {"role": "user", "content": "Analyze the FFmpeg video transcoding log below and provide the transcoding status, PSNR value, any detected error message, and suggested resolution steps.ffmpeg -i input.mp4 -c:v copy -b:v 1G -c:a copy output_large_bitrate.mp4\nffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 FFmpeg developers\n[mp4 @ 0x...] Value 1000000000 for parameter 'video_bit_rate' is out of range [-2147483648 - 2147483647]\nCould not write header for output file 'output_large_bitrate.mp4': Invalid argument\nConversion failed!"},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(input_ids, streamer = text_streamer, max_new_tokens = 128, pad_token_id = tokenizer.eos_token_id)
{"success": false, "psnr": 0.00, "error": "Invalid argument for video bitrate (-b:v) or audio bitrate (-b:a) parameter.", "resolution": "Check the allowed range for bitrate parameters in FFmpeg documentation. Ensure the provided value is within the valid range. If using a specific bitrate, verify the file size and resolution requirements."}<|eot_id|>

3.8 Save the Model

model.save_pretrained_merged(
    "ffmpeg_log_analyze_model",
    tokenizer,
    save_method = "merged_4bit_forced",  # force the LoRA merge to happen in 4-bit
)

Since the base model we loaded was already 4-bit, the merged model is saved in 4-bit as well.
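The merged directory can then be reloaded the same way the base model was loaded in section 3.1 (a sketch, assuming the local path saved above):

from unsloth import FastLanguageModel

# Reload the merged 4-bit model from the local directory for inference
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "ffmpeg_log_analyze_model",
    max_seq_length = 2048,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)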
