- use native speech detector

This commit is contained in:
Dmytro Bogovych 2023-09-14 15:44:55 +03:00
parent 48743574ad
commit 0bf8134feb
4 changed files with 37 additions and 23 deletions

View File

@ -36,8 +36,8 @@ class AgentConfig:
# Should the first task run immediately?
ForceRun = False
# Use silence eraser or not (speech detector is used in this case)
UseSilenceEraser = True
# Use external speech detector if needed
UseSpeechDetector = False
# Path to log file
LogPath : Path = None
@ -90,7 +90,7 @@ class AgentConfig:
if 'speech_detector' in config:
if config['speech_detector']:
self.UseSilenceEraser = False
self.UseSpeechDetector = True
if 'audio' in config:
audio = config['audio']

View File

@ -68,6 +68,9 @@ def detect_degraded_signal(file_test: Path, file_reference: Path) -> SignalBound
# Seems some problem with recording, return zero boundaries
return SignalBoundaries()
if CONFIG.UseSpeechDetector:
r = bt_signal.find_reference_signal_via_speechdetector(file_test)
else:
r = bt_signal.find_reference_signal(file_test)
if r.offset_start == 0.0 and is_caller:
@ -78,6 +81,9 @@ def detect_degraded_signal(file_test: Path, file_reference: Path) -> SignalBound
def detect_reference_signal(file_reference: Path) -> SignalBoundaries:
    """Locate the signal boundaries inside the reference audio file.

    The detection backend is chosen by ``CONFIG.UseSpeechDetector``: when it
    is truthy the speech-detector based finder from ``bt_signal`` is used,
    otherwise the default ``bt_signal.find_reference_signal`` is called.

    Args:
        file_reference: Path to the reference audio file.

    Returns:
        Whatever the selected ``bt_signal`` finder returns for the file.
    """
    # Guard clause: speech-detector path requested by configuration
    if CONFIG.UseSpeechDetector:
        return bt_signal.find_reference_signal_via_speechdetector(file_reference)

    # Default path
    return bt_signal.find_reference_signal(file_reference)

View File

@ -3,33 +3,22 @@
import sys
import os
import pathlib
from utils_types import SignalBoundaries
from utils_sevana import speech_detector
from pydub import silence, AudioSegment
class SignalBoundaries:
    """Pair of offsets (in seconds) describing where a signal sits in a file."""

    # Offset from start (in seconds)
    offset_start: float
    # Offset from finish (in seconds)
    offset_finish: float

    def __init__(self, offset_start: float = 0.0, offset_finish: float = 0.0) -> None:
        # Both offsets default to zero
        self.offset_start, self.offset_finish = offset_start, offset_finish

    def __repr__(self) -> str:
        # Offsets are rounded to 3 decimal places for display only
        start = round(self.offset_start, 3)
        finish = round(self.offset_finish, 3)
        return f'[offset_start: {start}, offset_finish : {finish}]'
SILENCE_DELTA = 16
def find_reference_signal(input_file: pathlib.Path, output_file: pathlib.Path = None, use_end_offset: bool = True) -> SignalBoundaries:
myaudio = AudioSegment.from_wav(str(input_file))
dBFS = myaudio.dBFS
# Find silence intervals
intervals = silence.detect_nonsilent(myaudio, min_silence_len=1000, silence_thresh=dBFS-17, seek_step=50)
intervals = silence.detect_nonsilent(myaudio, min_silence_len=1000, silence_thresh=dBFS-SILENCE_DELTA, seek_step=50)
# Translate to seconds
intervals = [((start/1000),(stop/1000)) for start,stop in intervals] #in sec
intervals = [((start/1000),(stop/1000)) for start,stop in intervals] # in sec
# print(intervals)
@ -48,6 +37,12 @@ def find_reference_signal(input_file: pathlib.Path, output_file: pathlib.Path =
return SignalBoundaries()
def find_reference_signal_via_speechdetector(input_file: pathlib.Path) -> SignalBoundaries:
    """Find signal boundaries in *input_file* using the external speech detector.

    Args:
        input_file: Path to the audio file; it is passed to ``speech_detector``
            as a string.

    Returns:
        A ``SignalBoundaries`` built from the first two items of the
        detector result.
    """
    # NOTE(review): assumes speech_detector returns an indexable pair
    # (start offset, finish offset) - confirm against utils_sevana.
    bounds = speech_detector(str(input_file))

    # Return the wrapped object, not the raw detector output: callers read
    # .offset_start / .offset_finish from the result.
    return SignalBoundaries(bounds[0], bounds[1])
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f'Please specify input filename.')

View File

@ -6,12 +6,25 @@ import utils
import json
from crontab import CronTab
# Exit codes
EXIT_OK = 0
EXIT_ERROR = 1
class SignalBoundaries:
    """Start and finish offsets of a detected signal, both in seconds."""

    # Offset from start (in seconds)
    offset_start: float
    # Offset from finish (in seconds)
    offset_finish: float

    def __init__(self, offset_start: float = 0.0, offset_finish: float = 0.0) -> None:
        # Zero offsets are the defaults
        self.offset_finish = offset_finish
        self.offset_start = offset_start

    def __repr__(self) -> str:
        # Round only for presentation; stored values stay untouched
        shown_start, shown_finish = (
            round(value, 3) for value in (self.offset_start, self.offset_finish)
        )
        return f'[offset_start: {shown_start}, offset_finish : {shown_finish}]'
class Phone:
identifier: int = 0
name: str = ""