Qwen3-ASR-1.7B Deployment and Testing

Preface

vLLM must be installed on Linux, or inside WSL2 on Windows.
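If you are on Windows, you can confirm the shell is actually running inside WSL2 by checking the kernel string (exact version varies by build):

uname -r
# e.g. 5.15.153.1-microsoft-standard-WSL2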

Environment Variables

HF environment variable

HF_HUB_CACHE

ModelScope environment variable

MODELSCOPE_CACHE

Setting the variable

echo 'export MODELSCOPE_CACHE=/mnt/d/AI' >> ~/.bashrc
source ~/.bashrc
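If you also download from the Hugging Face Hub, give HF_HUB_CACHE its own directory in the same way (the path below is illustrative; see the note that follows):

echo 'export HF_HUB_CACHE=/mnt/d/AI_HF' >> ~/.bashrc
source ~/.bashrc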

Note

The two tools use different directory hierarchies under their cache roots, so they cannot share the same path.
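For illustration, the two layouts look roughly like this (the ModelScope side matches the paths used later in this post; the HF side follows the hub's models--org--name convention; /mnt/d/AI_HF is a made-up root):

/mnt/d/AI                                # MODELSCOPE_CACHE
└── models/Qwen/Qwen3-ASR-1.7B/...

/mnt/d/AI_HF                             # HF_HUB_CACHE
└── models--Qwen--Qwen3-ASR-1.7B/snapshots/<hash>/...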

Getting the model path

import os

model_path = os.path.join(
    os.environ.get("MODELSCOPE_CACHE", ""), "models", "Qwen", "Qwen3-ASR-1.7B"
)
print(model_path)
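A quick guard before loading catches a missing download or an unset variable (a hypothetical check, reusing model_path from above):

if not os.path.isdir(model_path):
    raise FileNotFoundError(f"model not found, check MODELSCOPE_CACHE: {model_path}")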

The model can then be loaded with the full path:

import os

import torch
from qwen_asr import Qwen3ASRModel

model = Qwen3ASRModel.from_pretrained(
    os.path.join(
        os.environ.get("MODELSCOPE_CACHE", ""), "models", "Qwen", "Qwen3-ASR-1.7B"
    ),
    dtype=torch.bfloat16,
    device_map="cuda:0",
    max_inference_batch_size=32,
    max_new_tokens=256,
)

Creating a virtual environment

Create the environment

uv venv --python 3.12
source .venv/bin/activate

Downloading the model

Install ModelScope

uv pip install modelscope

Download the models

modelscope download --model Qwen/Qwen3-ASR-1.7B

modelscope download --model Qwen/Qwen3-ASR-0.6B
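With MODELSCOPE_CACHE=/mnt/d/AI as set earlier, both models should land under the cache root:

ls /mnt/d/AI/models/Qwen/
# Qwen3-ASR-0.6B  Qwen3-ASR-1.7B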

vLLM backend

Install dependencies

uv pip install -U qwen-asr[vllm]

Test

main01.py

import os
import torch
from qwen_asr import Qwen3ASRModel

model = Qwen3ASRModel.from_pretrained(
    os.path.join(
        os.environ.get("MODELSCOPE_CACHE", ""), "models", "Qwen", "Qwen3-ASR-0.6B"
    ),
    dtype=torch.bfloat16,
    device_map="cuda:0",
    max_inference_batch_size=32,
    max_new_tokens=256,
)

results = model.transcribe(
    audio="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav",
    language=None,  # set "English" to force the language
)

print(results[0].language)
print(results[0].text)
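Run it inside the activated venv:

python main01.py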

Serving

qwen-asr-serve Qwen/Qwen3-ASR-0.6B --gpu-memory-utilization 0.8 --host 0.0.0.0 --port 8000

qwen-asr-serve /mnt/d/AI/models/Qwen/Qwen3-ASR-0.6B --gpu-memory-utilization 0.8 --host 0.0.0.0 --port 8000
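A quick smoke test with curl, using the same endpoint and payload shape as the Python client below:

curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages":[{"role":"user","content":[{"type":"audio_url","audio_url":{"url":"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"}}]}]}'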

Calling the service

import requests

url = "http://localhost:8000/v1/chat/completions"
headers = {"Content-Type": "application/json"}

data = {
    "messages": [
        {
            "role": "user",
            "content": [
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
                    },
                }
            ],
        }
    ]
}

response = requests.post(url, headers=headers, json=data, timeout=300)
response.raise_for_status()
content = response.json()['choices'][0]['message']['content']
print(content)

# parse ASR output if you want
from qwen_asr import parse_asr_output
language, text = parse_asr_output(content)
print(language)
print(text)
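For batches, the same payload shape can be fanned out with a thread pool. This is a minimal sketch; the audio URL list is illustrative and should be replaced with your own files:

import concurrent.futures

import requests

URL = "http://localhost:8000/v1/chat/completions"
AUDIO_URLS = [  # illustrative inputs
    "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav",
]


def transcribe(audio_url: str) -> str:
    # Same payload as the single-request client above.
    data = {
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "audio_url", "audio_url": {"url": audio_url}}
                ],
            }
        ]
    }
    resp = requests.post(URL, json=data, timeout=300)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]


with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
    for audio_url, content in zip(AUDIO_URLS, pool.map(transcribe, AUDIO_URLS)):
        print(audio_url, "->", content)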

Streaming inference

main.py

import io
import urllib.request
from typing import Tuple

import numpy as np
import soundfile as sf

from qwen_asr import Qwen3ASRModel


ASR_MODEL_PATH = "Qwen/Qwen3-ASR-1.7B"
URL_EN = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"


def _download_audio_bytes(url: str, timeout: int = 30) -> bytes:
    req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.read()


def _read_wav_from_bytes(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
    with io.BytesIO(audio_bytes) as f:
        wav, sr = sf.read(f, dtype="float32", always_2d=False)
    return np.asarray(wav, dtype=np.float32), int(sr)


def _resample_to_16k(wav: np.ndarray, sr: int) -> np.ndarray:
    """Simple resample to 16k if needed (uses linear interpolation; good enough for a test)."""
    if sr == 16000:
        return wav.astype(np.float32, copy=False)
    wav = wav.astype(np.float32, copy=False)
    dur = wav.shape[0] / float(sr)
    n16 = int(round(dur * 16000))
    if n16 <= 0:
        return np.zeros((0,), dtype=np.float32)
    x_old = np.linspace(0.0, dur, num=wav.shape[0], endpoint=False)
    x_new = np.linspace(0.0, dur, num=n16, endpoint=False)
    return np.interp(x_new, x_old, wav).astype(np.float32)


def run_streaming_case(asr: Qwen3ASRModel, wav16k: np.ndarray, step_ms: int) -> None:
    sr = 16000
    step = int(round(step_ms / 1000.0 * sr))

    print(f"\n===== streaming step = {step_ms} ms =====")
    state = asr.init_streaming_state(
        unfixed_chunk_num=2,
        unfixed_token_num=5,
        chunk_size_sec=2.0,
    )

    pos = 0
    call_id = 0
    while pos < wav16k.shape[0]:
        seg = wav16k[pos : pos + step]
        pos += seg.shape[0]
        call_id += 1
        asr.streaming_transcribe(seg, state)
        print(f"[call {call_id:03d}] language={state.language!r} text={state.text!r}")

    asr.finish_streaming_transcribe(state)
    print(f"[final] language={state.language!r} text={state.text!r}")


def main() -> None:
    # Streaming is vLLM-only; the forced aligner is not supported.
    asr = Qwen3ASRModel.LLM(
        model=ASR_MODEL_PATH,
        gpu_memory_utilization=0.8,
        max_new_tokens=32,  # set a small value for streaming
    )

    audio_bytes = _download_audio_bytes(URL_EN)
    wav, sr = _read_wav_from_bytes(audio_bytes)
    wav16k = _resample_to_16k(wav, sr)

    for step_ms in [500, 1000, 2000, 4000]:
        run_streaming_case(asr, wav16k, step_ms)


if __name__ == "__main__":
    main()
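To mimic live capture more closely, the same loop can be paced so each chunk arrives at wall-clock rate. A sketch that could be appended to the script above (it reuses its imports and the streaming API already shown; time.sleep granularity is approximate):

import time


def run_streaming_realtime(asr: Qwen3ASRModel, wav16k: np.ndarray, step_ms: int = 500) -> None:
    """Same as run_streaming_case, but sleeps so chunks arrive in (simulated) real time."""
    sr = 16000
    step = int(round(step_ms / 1000.0 * sr))
    state = asr.init_streaming_state(
        unfixed_chunk_num=2,
        unfixed_token_num=5,
        chunk_size_sec=2.0,
    )
    pos = 0
    while pos < wav16k.shape[0]:
        seg = wav16k[pos : pos + step]
        pos += seg.shape[0]
        asr.streaming_transcribe(seg, state)
        print(f"text={state.text!r}")
        time.sleep(step_ms / 1000.0)  # pace input to simulate a live audio source
    asr.finish_streaming_transcribe(state)
    print(f"[final] language={state.language!r} text={state.text!r}")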