在 python 中的单词上拆分语音音频文件
Split speech audio file on words in python
我觉得这是一个相当普遍的问题,但我还没有找到合适的答案。我有很多人类语音的音频文件,我想打破单词,这可以通过查看波形中的暂停来启发式地完成,但是任何人都可以指出 function/library 中的 python这是自动的?
您可以查看 Audiolab It provides a decent API to convert the voice samples into numpy 个数组。
Audiolab 模块使用 libsndfile C++ 库来完成繁重的工作。
然后您可以解析数组以找到较低的值来找到暂停。
一个更简单的方法是使用 pydub module. recent addition of silent utilities 完成所有繁重的工作,例如 setting up silence threahold
、setting up silence length
。等等,与提到的其他方法相比,大大简化了代码。
这是一个演示实现,灵感来自 here
设置:
我在文件 "a-z.wav" 中有一个音频文件,其中包含从 A
到 Z
的英语口语字母。在当前工作目录中创建了一个子目录 splitAudio
。执行演示代码后,文件被分成 26 个单独的文件,每个音频文件存储每个音节。
观察:
部分音节被截断,可能需要修改以下参数,
min_silence_len=500
silence_thresh=-16
人们可能想根据自己的要求调整这些。
演示代码:
from pydub import AudioSegment
from pydub.silence import split_on_silence
sound_file = AudioSegment.from_wav("a-z.wav")
audio_chunks = split_on_silence(sound_file,
# must be silent for at least half a second
min_silence_len=500,
# consider it silent if quieter than -16 dBFS
silence_thresh=-16
)
for i, chunk in enumerate(audio_chunks):
out_file = ".//splitAudio//chunk{0}.wav".format(i)
print "exporting", out_file
chunk.export(out_file, format="wav")
输出:
Python 2.7.9 (default, Dec 10 2014, 12:24:55) [MSC v.1500 32 bit (Intel)] on win32
Type "copyright", "credits" or "license()" for more information.
>>> ================================ RESTART ================================
>>>
exporting .//splitAudio//chunk0.wav
exporting .//splitAudio//chunk1.wav
exporting .//splitAudio//chunk2.wav
exporting .//splitAudio//chunk3.wav
exporting .//splitAudio//chunk4.wav
exporting .//splitAudio//chunk5.wav
exporting .//splitAudio//chunk6.wav
exporting .//splitAudio//chunk7.wav
exporting .//splitAudio//chunk8.wav
exporting .//splitAudio//chunk9.wav
exporting .//splitAudio//chunk10.wav
exporting .//splitAudio//chunk11.wav
exporting .//splitAudio//chunk12.wav
exporting .//splitAudio//chunk13.wav
exporting .//splitAudio//chunk14.wav
exporting .//splitAudio//chunk15.wav
exporting .//splitAudio//chunk16.wav
exporting .//splitAudio//chunk17.wav
exporting .//splitAudio//chunk18.wav
exporting .//splitAudio//chunk19.wav
exporting .//splitAudio//chunk20.wav
exporting .//splitAudio//chunk21.wav
exporting .//splitAudio//chunk22.wav
exporting .//splitAudio//chunk23.wav
exporting .//splitAudio//chunk24.wav
exporting .//splitAudio//chunk25.wav
exporting .//splitAudio//chunk26.wav
>>>
使用IBM STT。使用 timestamps=true
你会在系统检测到它们被说出时得到单词 break up。
还有许多其他很酷的功能,例如 word_alternatives_threshold
可获取单词的其他可能性,以及 word_confidence
可获得系统预测单词的置信度。将 word_alternatives_threshold
设置在(0.1 和 0.01)之间以获得真正的想法。
这需要登录,之后您可以使用生成的用户名和密码。
IBM STT已经是提到的speechrecognition模块的一部分,但是要获取单词时间戳,您需要修改函数。
提取和修改后的表格如下所示:
def extracted_from_sr_recognize_ibm(audio_data, username=IBM_USERNAME, password=IBM_PASSWORD, language="en-US", show_all=False, timestamps=False,
word_confidence=False, word_alternatives_threshold=0.1):
assert isinstance(username, str), "``username`` must be a string"
assert isinstance(password, str), "``password`` must be a string"
flac_data = audio_data.get_flac_data(
convert_rate=None if audio_data.sample_rate >= 16000 else 16000, # audio samples should be at least 16 kHz
convert_width=None if audio_data.sample_width >= 2 else 2 # audio samples should be at least 16-bit
)
url = "https://stream-fra.watsonplatform.net/speech-to-text/api/v1/recognize?{}".format(urlencode({
"profanity_filter": "false",
"continuous": "true",
"model": "{}_BroadbandModel".format(language),
"timestamps": "{}".format(str(timestamps).lower()),
"word_confidence": "{}".format(str(word_confidence).lower()),
"word_alternatives_threshold": "{}".format(word_alternatives_threshold)
}))
request = Request(url, data=flac_data, headers={
"Content-Type": "audio/x-flac",
"X-Watson-Learning-Opt-Out": "true", # prevent requests from being logged, for improved privacy
})
authorization_value = base64.standard_b64encode("{}:{}".format(username, password).encode("utf-8")).decode("utf-8")
request.add_header("Authorization", "Basic {}".format(authorization_value))
try:
response = urlopen(request, timeout=None)
except HTTPError as e:
raise sr.RequestError("recognition request failed: {}".format(e.reason))
except URLError as e:
raise sr.RequestError("recognition connection failed: {}".format(e.reason))
response_text = response.read().decode("utf-8")
result = json.loads(response_text)
# return results
if show_all: return result
if "results" not in result or len(result["results"]) < 1 or "alternatives" not in result["results"][0]:
raise Exception("Unknown Value Exception")
transcription = []
for utterance in result["results"]:
if "alternatives" not in utterance:
raise Exception("Unknown Value Exception. No Alternatives returned")
for hypothesis in utterance["alternatives"]:
if "transcript" in hypothesis:
transcription.append(hypothesis["transcript"])
return "\n".join(transcription)
pyAudioAnalysis 如果单词被清楚地分开(自然语音中很少出现这种情况),则可以对音频文件进行分段。包比较好用:
python pyAudioAnalysis/pyAudioAnalysis/audioAnalysis.py silenceRemoval -i SPEECH_AUDIO_FILE_TO_SPLIT.mp3 --smoothing 1.0 --weight 0.3
有关我的 blog 的更多详细信息。
我的函数变体,可能会更容易根据您的需要进行修改:
from scipy.io.wavfile import write as write_wav
import numpy as np
import librosa
def zero_runs(a):
iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
absdiff = np.abs(np.diff(iszero))
ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
return ranges
def split_in_parts(audio_path, out_dir):
# Some constants
min_length_for_silence = 0.01 # seconds
percentage_for_silence = 0.01 # eps value for silence
required_length_of_chunk_in_seconds = 60 # Chunk will be around this value not exact
sample_rate = 16000 # Set to None to use default
# Load audio
waveform, sampling_rate = librosa.load(audio_path, sr=sample_rate)
# Create mask of silence
eps = waveform.max() * percentage_for_silence
silence_mask = (np.abs(waveform) < eps).astype(np.uint8)
# Find where silence start and end
runs = zero_runs(silence_mask)
lengths = runs[:, 1] - runs[:, 0]
# Left only large silence ranges
min_length_for_silence = min_length_for_silence * sampling_rate
large_runs = runs[lengths > min_length_for_silence]
lengths = lengths[lengths > min_length_for_silence]
# Mark only center of silence
silence_mask[...] = 0
for start, end in large_runs:
center = (start + end) // 2
silence_mask[center] = 1
min_required_length = required_length_of_chunk_in_seconds * sampling_rate
chunks = []
prev_pos = 0
for i in range(min_required_length, len(waveform), min_required_length):
start = i
end = i + min_required_length
next_pos = start + silence_mask[start:end].argmax()
part = waveform[prev_pos:next_pos].copy()
prev_pos = next_pos
if len(part) > 0:
chunks.append(part)
# Add last part of waveform
part = waveform[prev_pos:].copy()
chunks.append(part)
print('Total chunks: {}'.format(len(chunks)))
new_files = []
for i, chunk in enumerate(chunks):
out_file = out_dir + "chunk_{}.wav".format(i)
print("exporting", out_file)
write_wav(out_file, sampling_rate, chunk)
new_files.append(out_file)
return new_files
我觉得这是一个相当普遍的问题,但我还没有找到合适的答案。我有很多人类语音的音频文件,我想打破单词,这可以通过查看波形中的暂停来启发式地完成,但是任何人都可以指出 function/library 中的 python这是自动的?
您可以查看 Audiolab It provides a decent API to convert the voice samples into numpy 个数组。 Audiolab 模块使用 libsndfile C++ 库来完成繁重的工作。
然后您可以解析数组以找到较低的值来找到暂停。
一个更简单的方法是使用 pydub module. recent addition of silent utilities 完成所有繁重的工作,例如 setting up silence threahold
、setting up silence length
。等等,与提到的其他方法相比,大大简化了代码。
这是一个演示实现,灵感来自 here
设置:
我在文件 "a-z.wav" 中有一个音频文件,其中包含从 A
到 Z
的英语口语字母。在当前工作目录中创建了一个子目录 splitAudio
。执行演示代码后,文件被分成 26 个单独的文件,每个音频文件存储每个音节。
观察:
部分音节被截断,可能需要修改以下参数,
min_silence_len=500
silence_thresh=-16
人们可能想根据自己的要求调整这些。
演示代码:
from pydub import AudioSegment
from pydub.silence import split_on_silence
sound_file = AudioSegment.from_wav("a-z.wav")
audio_chunks = split_on_silence(sound_file,
# must be silent for at least half a second
min_silence_len=500,
# consider it silent if quieter than -16 dBFS
silence_thresh=-16
)
for i, chunk in enumerate(audio_chunks):
out_file = ".//splitAudio//chunk{0}.wav".format(i)
print "exporting", out_file
chunk.export(out_file, format="wav")
输出:
Python 2.7.9 (default, Dec 10 2014, 12:24:55) [MSC v.1500 32 bit (Intel)] on win32
Type "copyright", "credits" or "license()" for more information.
>>> ================================ RESTART ================================
>>>
exporting .//splitAudio//chunk0.wav
exporting .//splitAudio//chunk1.wav
exporting .//splitAudio//chunk2.wav
exporting .//splitAudio//chunk3.wav
exporting .//splitAudio//chunk4.wav
exporting .//splitAudio//chunk5.wav
exporting .//splitAudio//chunk6.wav
exporting .//splitAudio//chunk7.wav
exporting .//splitAudio//chunk8.wav
exporting .//splitAudio//chunk9.wav
exporting .//splitAudio//chunk10.wav
exporting .//splitAudio//chunk11.wav
exporting .//splitAudio//chunk12.wav
exporting .//splitAudio//chunk13.wav
exporting .//splitAudio//chunk14.wav
exporting .//splitAudio//chunk15.wav
exporting .//splitAudio//chunk16.wav
exporting .//splitAudio//chunk17.wav
exporting .//splitAudio//chunk18.wav
exporting .//splitAudio//chunk19.wav
exporting .//splitAudio//chunk20.wav
exporting .//splitAudio//chunk21.wav
exporting .//splitAudio//chunk22.wav
exporting .//splitAudio//chunk23.wav
exporting .//splitAudio//chunk24.wav
exporting .//splitAudio//chunk25.wav
exporting .//splitAudio//chunk26.wav
>>>
使用IBM STT。使用 timestamps=true
你会在系统检测到它们被说出时得到单词 break up。
还有许多其他很酷的功能,例如 word_alternatives_threshold
可获取单词的其他可能性,以及 word_confidence
可获得系统预测单词的置信度。将 word_alternatives_threshold
设置在(0.1 和 0.01)之间以获得真正的想法。
这需要登录,之后您可以使用生成的用户名和密码。
IBM STT已经是提到的speechrecognition模块的一部分,但是要获取单词时间戳,您需要修改函数。
提取和修改后的表格如下所示:
def extracted_from_sr_recognize_ibm(audio_data, username=IBM_USERNAME, password=IBM_PASSWORD, language="en-US", show_all=False, timestamps=False,
word_confidence=False, word_alternatives_threshold=0.1):
assert isinstance(username, str), "``username`` must be a string"
assert isinstance(password, str), "``password`` must be a string"
flac_data = audio_data.get_flac_data(
convert_rate=None if audio_data.sample_rate >= 16000 else 16000, # audio samples should be at least 16 kHz
convert_width=None if audio_data.sample_width >= 2 else 2 # audio samples should be at least 16-bit
)
url = "https://stream-fra.watsonplatform.net/speech-to-text/api/v1/recognize?{}".format(urlencode({
"profanity_filter": "false",
"continuous": "true",
"model": "{}_BroadbandModel".format(language),
"timestamps": "{}".format(str(timestamps).lower()),
"word_confidence": "{}".format(str(word_confidence).lower()),
"word_alternatives_threshold": "{}".format(word_alternatives_threshold)
}))
request = Request(url, data=flac_data, headers={
"Content-Type": "audio/x-flac",
"X-Watson-Learning-Opt-Out": "true", # prevent requests from being logged, for improved privacy
})
authorization_value = base64.standard_b64encode("{}:{}".format(username, password).encode("utf-8")).decode("utf-8")
request.add_header("Authorization", "Basic {}".format(authorization_value))
try:
response = urlopen(request, timeout=None)
except HTTPError as e:
raise sr.RequestError("recognition request failed: {}".format(e.reason))
except URLError as e:
raise sr.RequestError("recognition connection failed: {}".format(e.reason))
response_text = response.read().decode("utf-8")
result = json.loads(response_text)
# return results
if show_all: return result
if "results" not in result or len(result["results"]) < 1 or "alternatives" not in result["results"][0]:
raise Exception("Unknown Value Exception")
transcription = []
for utterance in result["results"]:
if "alternatives" not in utterance:
raise Exception("Unknown Value Exception. No Alternatives returned")
for hypothesis in utterance["alternatives"]:
if "transcript" in hypothesis:
transcription.append(hypothesis["transcript"])
return "\n".join(transcription)
pyAudioAnalysis 如果单词被清楚地分开(自然语音中很少出现这种情况),则可以对音频文件进行分段。包比较好用:
python pyAudioAnalysis/pyAudioAnalysis/audioAnalysis.py silenceRemoval -i SPEECH_AUDIO_FILE_TO_SPLIT.mp3 --smoothing 1.0 --weight 0.3
有关我的 blog 的更多详细信息。
我的函数变体,可能会更容易根据您的需要进行修改:
from scipy.io.wavfile import write as write_wav
import numpy as np
import librosa
def zero_runs(a):
iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
absdiff = np.abs(np.diff(iszero))
ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
return ranges
def split_in_parts(audio_path, out_dir):
# Some constants
min_length_for_silence = 0.01 # seconds
percentage_for_silence = 0.01 # eps value for silence
required_length_of_chunk_in_seconds = 60 # Chunk will be around this value not exact
sample_rate = 16000 # Set to None to use default
# Load audio
waveform, sampling_rate = librosa.load(audio_path, sr=sample_rate)
# Create mask of silence
eps = waveform.max() * percentage_for_silence
silence_mask = (np.abs(waveform) < eps).astype(np.uint8)
# Find where silence start and end
runs = zero_runs(silence_mask)
lengths = runs[:, 1] - runs[:, 0]
# Left only large silence ranges
min_length_for_silence = min_length_for_silence * sampling_rate
large_runs = runs[lengths > min_length_for_silence]
lengths = lengths[lengths > min_length_for_silence]
# Mark only center of silence
silence_mask[...] = 0
for start, end in large_runs:
center = (start + end) // 2
silence_mask[center] = 1
min_required_length = required_length_of_chunk_in_seconds * sampling_rate
chunks = []
prev_pos = 0
for i in range(min_required_length, len(waveform), min_required_length):
start = i
end = i + min_required_length
next_pos = start + silence_mask[start:end].argmax()
part = waveform[prev_pos:next_pos].copy()
prev_pos = next_pos
if len(part) > 0:
chunks.append(part)
# Add last part of waveform
part = waveform[prev_pos:].copy()
chunks.append(part)
print('Total chunks: {}'.format(len(chunks)))
new_files = []
for i, chunk in enumerate(chunks):
out_file = out_dir + "chunk_{}.wav".format(i)
print("exporting", out_file)
write_wav(out_file, sampling_rate, chunk)
new_files.append(out_file)
return new_files