实时语音转文字

AI 2026-06-25 8

基于 Python 与 NVIDIA NeMo 的语音转文字实现方案，涵盖实时麦克风输入转写与音频文件离线转写两类场景。系统采用 nemotron-3.5-asr-streaming-0.6b.nemo 模型作为核心识别引擎，在 CPU 环境下完成模型加载、音频采集、分段处理与文本输出。针对多语言 Prompt 模型在实际调用过程中可能出现的语言字段丢失、日志干扰及转写链路兼容性问题，方案通过显式构造临时 manifest、传递语言标识、静默第三方警告输出以及清理结果尾部语言标签等方式进行了工程化适配。最终实现的系统既支持麦克风实时采集后按时间片段连续识别，也支持对指定音频文件进行直接转写，具有部署简单、可扩展性强、适合本地化使用等特点，可为个人语音记录、会议纪要生成、音频内容整理等应用提供技术参考。

模型下载地址：https://hf-mirror.com/nvidia/nemotron-3.5-asr-streaming-0.6b/tree/main

Python 环境（依赖包及版本）：

absl-py==2.4.0
accelerate==1.14.0
aiohappyeyeballs==2.6.2
aiohttp==3.14.1
aiosignal==1.4.0
aistore==1.25.0
alembic==1.18.4
annotated-doc==0.0.4
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.14.0
asttokens==3.0.1
attrs==26.1.0
audioread==3.1.0
braceexpand==0.1.7
certifi==2026.6.17
cffi==2.0.0
charset-normalizer==3.4.7
click==8.4.2
cloudpickle==3.1.2
colorama==0.4.6
colorlog==6.10.1
contourpy==1.3.3
cycler==0.12.1
Cython==3.2.6
cytoolz==1.1.0
datasets==5.0.0
decorator==5.3.1
dill==0.4.1
distro==1.9.0
editdistance==0.8.1
einops==0.8.2
executing==2.2.1
fiddle==0.3.0
filelock==3.29.4
flatbuffers==25.12.19
fonttools==4.63.0
frozenlist==1.8.0
fsspec==2024.12.0
gitdb==4.0.12
GitPython==3.1.50
graphviz==0.21
grpcio==1.81.1
h11==0.16.0
hf-xet==1.5.1
httpcore==1.0.9
httpx==0.28.1
huggingface_hub==0.36.2
humanize==4.15.0
hydra-core==1.3.2
idna==3.18
indic_numtowords==1.1.0
inflect==7.5.0
intervaltree==3.2.1
ipython==9.14.1
ipython_pygments_lexers==1.1.1
jedi==0.20.0
Jinja2==3.1.6
jiter==0.15.0
jiwer==3.1.0
joblib==1.5.3
kaldi-native-fbank==1.22.3
kaldi-python-io==1.2.2
kaldialign==0.9.1
kiwisolver==1.5.0
lazy-loader==0.5
lhotse==1.33.0
libcst==1.8.6
librosa==0.11.0
lightning==2.4.0
lightning-utilities==0.15.3
llvmlite==0.47.0
lxml==6.1.1
Mako==1.3.12
Markdown==3.10.2
markdown-it-py==4.2.0
MarkupSafe==3.0.3
marshmallow==4.3.0
matplotlib==3.11.0
matplotlib-inline==0.2.2
mdurl==0.1.2
mediapy==1.1.6
ml_dtypes==0.5.4
more-itertools==11.1.0
mpmath==1.3.0
msgpack==1.2.1
msgspec==0.21.1
multidict==6.7.1
multiprocess==0.70.19
narwhals==2.22.1
nemo-toolkit @ https://github.com/NVIDIA/NeMo/archive/refs/heads/main.zip#sha256=849feec5f4c6aae2f9618840e69b4fcff27d4b3a5409a2104c721c93115bffa3
networkx==3.6.1
numba==0.65.1
numexpr==2.13.1
numpy==2.4.6
nv-one-logger-core==2.3.1
nv-one-logger-pytorch-lightning-integration==2.3.1
nv-one-logger-training-telemetry==2.3.1
omegaconf==2.3.0
onnx==1.22.0
onnxruntime==1.27.0
openai==2.43.0
optuna==4.9.0
overrides==7.7.0
packaging==24.2
pandas==3.0.3
parso==0.8.7
peft==0.19.1
pexpect==4.9.0
pillow==12.2.0
platformdirs==4.10.0
pooch==1.9.0
portalocker==3.2.0
prompt_toolkit==3.0.52
propcache==0.5.2
protobuf==5.29.6
psutil==7.2.2
ptyprocess==0.7.0
pure_eval==0.2.3
pyannote-core==6.0.1
pyannote-database==6.1.1
pyannote-metrics==4.1
pyarrow==24.0.0
pycparser==3.0
pydantic==2.13.4
pydantic_core==2.46.4
pydub==0.25.1
Pygments==2.20.0
pyloudnorm==0.2.0
pyparsing==3.3.2
python-dateutil==2.9.0.post0
pytorch-lightning==2.6.5
PyYAML==6.0.3
RapidFuzz==3.14.5
regex==2026.5.9
requests==2.34.2
resampy==0.4.3
rich==15.0.0
ruamel.yaml==0.19.1
sacrebleu==2.6.0
sacremoses==0.1.1
safetensors==0.8.0
scikit-learn==1.9.0
scipy==1.18.0
sentencepiece==0.2.1
sentry-sdk==2.63.0
setuptools==81.0.0
shellingham==1.5.4
six==1.17.0
smart_open==7.6.1
smmap==5.0.3
sniffio==1.3.1
sortedcontainers==2.4.0
sounddevice==0.5.5
soundfile==0.14.0
sox==1.5.0
soxr==1.1.0
SQLAlchemy==2.0.51
stack-data==0.6.3
StrEnum==0.4.15
sympy==1.14.0
tabulate==0.10.0
tenacity==9.1.4
tensorboard==2.20.0
tensorboard-data-server==0.7.2
text-unidecode==1.3
threadpoolctl==3.6.0
tokenizers==0.22.2
toml==0.10.2
toolz==1.1.0
torch==2.12.1
torchaudio==2.11.0
torchmetrics==1.9.0
torchvision==0.27.1
tqdm==4.68.3
traitlets==5.15.1
transformers==4.57.6
typeguard==4.5.2
typer==0.25.1
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.7.0
wandb==0.28.0
wcwidth==0.8.1
webdataset==1.0.2
Werkzeug==3.1.8
wget==3.2
whisper_normalizer==0.1.12
wrapt==2.2.2
xxhash==3.7.1
yarl==1.24.2

实时语音转文字 Python 代码：

import contextlib
import io
import json
import queue
import re
import tempfile
from pathlib import Path

import numpy as np
import sounddevice as sd
import soundfile as sf
import torch

try:
    import nemo.collections.asr as nemo_asr
    from nemo.utils import logging as nemo_logging
except ImportError as exc:
    raise SystemExit("未安装 NVIDIA NeMo ASR。请先执行: pip install 'nemo_toolkit[asr]'") from exc

# ================= Configuration: set the absolute model path here =================
# The literal must not carry stray whitespace: a leading space makes the path
# relative (its first component becomes a directory named " "), so the
# is_file() check below would fail even when the model exists at "/...".
model_path = Path("/nemotron-3.5-asr-streaming-0.6b.nemo")
sample_rate = 16000  # microphone sample rate, in Hz
channels = 1  # mono capture
dtype = "float32"
chunk_duration = 2  # seconds of audio handed to the recognizer per call
target_lang = "zh-CN"  # may be "auto" / "en-US" or another language ID the model supports

# Fail fast with a readable message instead of an error from inside NeMo.
if not model_path.is_file():
    raise SystemExit(f"未找到 .nemo 模型文件: {model_path}")

# ================= Model initialisation =================
print("加载 .nemo 模型中...")
# Lower NeMo's logger to ERROR before restoring so the load stays quiet.
nemo_logging.set_verbosity(nemo_logging.ERROR)
asr_model = nemo_asr.models.ASRModel.restore_from(
    restore_path=str(model_path),
    map_location=torch.device("cpu"),
)
asr_model.freeze()

# ================= Audio stream plumbing =================
# Hands captured blocks from the microphone callback to the main loop.
audio_queue = queue.Queue()


def audio_callback(indata, _frames, _time, status):
    """Microphone stream callback: enqueue the first channel of each block.

    Args:
        indata: Captured samples, indexed as ``indata[frame, channel]``.
        _frames: Unused.
        _time: Unused.
        status: Stream status; printed when truthy.
    """
    if status:
        print(f"音频状态: {status}")
    # Always copy the first channel. The block is consumed later through the
    # queue, and the audio backend only guarantees the input buffer for the
    # duration of this callback, so a bare view (which is what the mono case
    # used to enqueue) could be overwritten before it is read.
    audio_queue.put(indata[:, 0].copy())


def transcribe_chunk(audio_array: np.ndarray) -> str:
    """Transcribe one microphone chunk through a temporary wav file and manifest.

    The samples are written to ``chunk.wav`` in a throw-away directory together
    with a one-line JSON manifest that names the wav, its duration, an empty
    reference text and the module-level ``target_lang``. The manifest, not the
    wav path, is passed to the model because the prompt model loses the
    language field when given a bare wav path.

    Args:
        audio_array: Samples of the chunk at the module-level ``sample_rate``.

    Returns:
        The text of the first result with a trailing ``<...>`` tag removed and
        whitespace trimmed, or ``""`` when no usable result came back.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        work_dir_path = Path(work_dir)
        wav_path = work_dir_path / "chunk.wav"
        manifest_path = work_dir_path / "chunk_manifest.json"

        sf.write(wav_path, audio_array, sample_rate)

        manifest_line = json.dumps(
            {
                "audio_filepath": str(wav_path),
                "duration": len(audio_array) / sample_rate,
                "text": "",
                "lang": target_lang,
            }
        )
        manifest_path.write_text(manifest_line + "\n", encoding="utf-8")

        # Swallow everything written to stdout/stderr during transcription.
        muted_out = contextlib.redirect_stdout(io.StringIO())
        muted_err = contextlib.redirect_stderr(io.StringIO())
        with muted_out, muted_err:
            result = asr_model.transcribe(
                [str(manifest_path)],
                batch_size=1,
                verbose=False,
            )

    if not isinstance(result, list) or not result:
        return ""

    # The first entry is either a plain string or an object exposing ``.text``.
    first = result[0]
    if isinstance(first, str):
        raw_text = first
    elif hasattr(first, "text"):
        raw_text = str(first.text)
    else:
        return ""

    # Strip the language tag appended at the end of the transcript.
    return re.sub(r"\s*<[^>]+>\s*$", "", raw_text).strip()


# ================= Start the microphone =================
print("\n开始录音，按 Ctrl+C 停止...\n")

audio_buffer = []
# Number of samples that make up one recognition window.
chunk_samples = sample_rate * chunk_duration

try:
    mic_stream = sd.InputStream(
        samplerate=sample_rate,
        channels=channels,
        dtype=dtype,
        callback=audio_callback,
        blocksize=int(chunk_samples),
    )
    with mic_stream:
        while True:
            # Wait for the next captured block; keep waiting if none arrives.
            try:
                block = audio_queue.get(timeout=5)
            except queue.Empty:
                print("5 秒内没有收到麦克风数据，请检查麦克风权限和输入设备")
                continue
            audio_buffer.extend(block.tolist())

            # Not enough audio for a full window yet.
            if len(audio_buffer) < chunk_samples:
                continue

            # Cut one window off the front and keep the remainder buffered.
            window = audio_buffer[:chunk_samples]
            audio_buffer = audio_buffer[chunk_samples:]

            text = transcribe_chunk(np.array(window))
            if text:
                print(f"实时语音转文字： {text}")

except KeyboardInterrupt:
    print("\n\n已停止录音")
except Exception as exc:
    print(f"错误: {exc}")

语音文件转文字 Python 代码：

import argparse
import contextlib
import io
import json
import re
import tempfile
from pathlib import Path

import soundfile as sf
import torch

try:
    import nemo.collections.asr as nemo_asr
    from nemo.utils import logging as nemo_logging
except ImportError as exc:
    raise SystemExit("未安装 NVIDIA NeMo ASR。请先执行: pip install 'nemo_toolkit[asr]'") from exc

# Absolute path of the `.nemo` model used when --model is not given.
DEFAULT_MODEL_PATH = Path("/nemotron-3.5-asr-streaming-0.6b.nemo")


def parse_args(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the file transcriber.

    Args:
        argv: Argument strings to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what a plain ``parse_args()`` call
            did before this parameter existed.

    Returns:
        A namespace with ``audio_file`` (positional), ``model`` (defaults to
        ``DEFAULT_MODEL_PATH`` as a string) and ``lang`` (defaults to
        ``"zh-CN"``).
    """
    parser = argparse.ArgumentParser(description="将音频文件转换为文本")
    parser.add_argument("audio_file", help="输入音频文件路径，例如 demo.wav")
    parser.add_argument(
        "--model",
        default=str(DEFAULT_MODEL_PATH),
        help="`.nemo` 模型文件路径",
    )
    parser.add_argument(
        "--lang",
        default="zh-CN",
        help='语言 ID，例如 "zh-CN"、"en-US"、"auto"',
    )
    return parser.parse_args(argv)


def load_model(model_path: Path):
    """Restore the ASR model stored at ``model_path`` on the CPU and freeze it.

    Args:
        model_path: Location of the ``.nemo`` model file.

    Returns:
        The restored model after ``freeze()`` has been called on it.

    Raises:
        SystemExit: If ``model_path`` is not an existing file.
    """
    if model_path.is_file():
        # Lower NeMo's logger to ERROR before restoring.
        nemo_logging.set_verbosity(nemo_logging.ERROR)
        restored = nemo_asr.models.ASRModel.restore_from(
            map_location=torch.device("cpu"),
            restore_path=str(model_path),
        )
        restored.freeze()
        return restored

    raise SystemExit(f"未找到 .nemo 模型文件: {model_path}")


def normalize_text(text: str) -> str:
    """Remove a trailing ``<...>`` tag (with surrounding whitespace) and trim the text."""
    without_tag = re.sub(r"\s*<[^>]+>\s*$", "", text)
    return without_tag.strip()


def transcribe_audio_file(model, audio_file: Path, target_lang: str) -> str:
    """Transcribe ``audio_file`` with ``model`` through a one-line temporary manifest.

    Args:
        model: Object whose ``transcribe`` method accepts a list of manifest paths.
        audio_file: Audio file to transcribe.
        target_lang: Language ID written to the manifest's ``lang`` field.

    Returns:
        The normalized text of the first result, or ``""`` when the result is
        not a non-empty list or its first entry is neither a string nor an
        object with a ``text`` attribute.

    Raises:
        SystemExit: If ``audio_file`` is not an existing file.
    """
    if not audio_file.is_file():
        raise SystemExit(f"未找到音频文件: {audio_file}")

    # Read the file's metadata; its duration goes into the manifest.
    audio_info = sf.info(str(audio_file))

    with tempfile.TemporaryDirectory() as work_dir:
        manifest_path = Path(work_dir) / "input_manifest.json"
        manifest_line = json.dumps(
            {
                "audio_filepath": str(audio_file.resolve()),
                "duration": float(audio_info.duration),
                "text": "",
                "lang": target_lang,
            }
        )
        manifest_path.write_text(manifest_line + "\n", encoding="utf-8")

        # Swallow everything written to stdout/stderr during transcription.
        muted_out = contextlib.redirect_stdout(io.StringIO())
        muted_err = contextlib.redirect_stderr(io.StringIO())
        with muted_out, muted_err:
            result = model.transcribe(
                [str(manifest_path)],
                batch_size=1,
                verbose=False,
            )

    if not isinstance(result, list) or not result:
        return ""

    head = result[0]
    if isinstance(head, str):
        return normalize_text(head)
    if hasattr(head, "text"):
        return normalize_text(str(head.text))
    return ""


def main():
    """Command-line entry point: load the model, transcribe one file, print the result.

    Both paths from the command line are user-expanded and resolved before use.
    Prints the transcript, or a fallback message when it is empty.
    """
    cli = parse_args()
    model_path = Path(cli.model).expanduser().resolve()
    audio_file = Path(cli.audio_file).expanduser().resolve()

    print("加载模型中...")
    model = load_model(model_path)

    transcript = transcribe_audio_file(model, audio_file, cli.lang)
    print(transcript if transcript else "未识别到文本")


if __name__ == "__main__":
    main()