在 Python 中使用 Microsoft Azure Speech-to-text 的字幕/说明文字
Posted
技术标签:
【中文标题】在 Python 中使用 Microsoft Azure Speech-to-text 的字幕/说明文字
【英文标题】Subtitles/captions with Microsoft Azure Speech-to-text in Python
【发布时间】2020-06-24 11:31:02
【问题描述】我一直试图弄清楚如何在 Python 中使用 Microsoft Azure 语音识别服务制作字幕,但没有成功。我已经按照别人在这里回答的、关于获取单个单词(及其时间戳)的提示进行了操作,但即便如此,把它们格式化为 .srt 或 .vtt 似乎也很复杂。代码如下:
import azure.cognitiveservices.speech as speechsdk
import json
import time  # fixed: 'time' was used below but never imported


def speech_recognize_continuous_from_file():
    """Perform continuous speech recognition on an audio file.

    Collects the display text of every final recognition result plus the
    word-level timings of the highest-confidence alternative, then writes
    the full transcript to 'Azure_Raw.txt'.
    """
    # <SpeechContinuousRecognitionWithFile>
    speech_key, service_region = "api-key", "service-region"  # fixed typo in placeholder
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    audio_filename = "for example: video.wav"
    audio_config = speechsdk.audio.AudioConfig(filename=audio_filename)
    speech_config.speech_recognition_language = "en-US"
    # Word-level timestamps are needed later to build subtitle timings.
    speech_config.request_word_level_timestamps()
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat(1)  # detailed JSON results
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                                   audio_config=audio_config)

    done = False
    transcript = []  # DisplayText of each final result, in order
    words = []       # word dicts (Word/Offset/Duration) of the best alternative

    def handle_final_result(evt):
        """Store display text and word timings from a final result event."""
        # fixed: the original assigned to a *local* 'results', so the outer
        # list stayed empty and the output file ended up blank.
        payload = json.loads(evt.result.json)
        transcript.append(payload['DisplayText'])
        confidence_list = [item.get('Confidence') for item in payload['NBest']]
        best_index = confidence_list.index(max(confidence_list))
        words.extend(payload['NBest'][best_index]['Words'])

    def stop_cb(evt):
        """Callback that stops continuous recognition upon receiving an event `evt`."""
        print('CLOSING on {}'.format(evt))  # fixed: format placeholder was missing
        speech_recognizer.stop_continuous_recognition()
        nonlocal done
        done = True
        print("Transcript display list:\n")
        print(transcript)
        print("\nWords\n")
        print(words)
        print("\n")

    # Connect callbacks to the events fired by the speech recognizer.
    speech_recognizer.recognized.connect(handle_final_result)
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED: {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED: {}'.format(evt)))
    # Stop continuous recognition on either session-stopped or canceled events.
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and poll until a stop event fires.
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)

    # fixed: originally joined the always-empty 'results' list; write the
    # collected transcript instead.
    with open('Azure_Raw.txt', 'w') as f:
        f.write('\n'.join(transcript))


# fixed: the original called the undefined sample_long_running_recognize(storage_uri)
if __name__ == "__main__":
    speech_recognize_continuous_from_file()
我在字幕上找到的唯一其他“教程”是 Google Cloud 的,它给出了我正在寻找的结果(是的,我自己测试过),但 Azure 显然根本不像 G-云:https://medium.com/searce/generate-srt-file-subtitles-using-google-clouds-speech-to-text-api-402b2f1da3bd
所以基本上:我如何将 3 秒的语音文本转换为 .srt 格式,如下所示:
1
00:00:00,000 --> 00:00:03,000
This is the first sentence that
2
00:00:03,000 --> 00:00:06,000
continues after 3 seconds or so
【问题讨论】:
【参考方案1】:因此,如果您仔细观察 - Azure 语音服务的 JSON 输出 - 它与其他服务的输出略有不同。
对于提到的配置,在您选择最佳匹配后输出如下所示
['Duration': 3900000, 'Offset': 500000, 'Word': "what's",
'Duration': 1300000, 'Offset': 4500000, 'Word': 'the',
'Duration': 2900000, 'Offset': 5900000, 'Word': 'weather',
'Duration': 4800000, 'Offset': 8900000, 'Word': 'like']
有三个输出 - Word、Duration 和偏移
持续时间(Duration)—— 该单词发音所占的时长,以 100 纳秒为单位;偏移量(Offset)—— 该单词从音频开始算起的起始时间,同样以 100 纳秒为单位。你必须利用这两个值来构建你的时间线。
import azure.cognitiveservices.speech as speechsdk
import os
import time
import pprint
import json
import srt
import datetime
path = os.getcwd()
# Creates an instance of a speech config with specified subscription key and service region.
# Replace with your own subscription key and region identifier from here: https://aka.ms/speech/sdkregion
speech_key, service_region = "<>", "<>"
speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
# Creates an audio configuration that points to an audio file.
# Replace with your own audio filename.
audio_filename = "sample.wav"
audio_input = speechsdk.audio.AudioConfig(filename=audio_filename)
# Creates a recognizer with the given settings
speech_config.speech_recognition_language="en-US"
# Word-level timestamps are required so each word carries Offset/Duration
# (in 100-ns ticks) used below to build the subtitle timeline.
speech_config.request_word_level_timestamps()
speech_config.enable_dictation()
# OutputFormat(1) selects the detailed JSON result (DisplayText + NBest).
speech_config.output_format = speechsdk.OutputFormat(1)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_input)
#result = speech_recognizer.recognize_once()
# Accumulators filled by the recognition callbacks:
#   all_results - plain recognized text of each utterance
#   transcript  - DisplayText field of each JSON result
#   words       - word dicts (Word/Offset/Duration) from the best alternative
all_results = []
results = []
transcript = []
words = []
#https://docs.microsoft.com/en-us/python/api/azure-cognitiveservices-speech/azure.cognitiveservices.speech.recognitionresult?view=azure-python
def handle_final_result(evt):
    """Record the recognized text and the word timings of the best alternative."""
    import json
    payload = json.loads(evt.result.json)
    all_results.append(evt.result.text)
    transcript.append(payload['DisplayText'])
    # Keep word-level timings only from the highest-confidence NBest entry.
    scores = [alt.get('Confidence') for alt in payload['NBest']]
    best = scores.index(max(scores))
    words.extend(payload['NBest'][best]['Words'])
# Flag flipped by stop_cb so the polling loop below can exit.
done = False

def stop_cb(evt):
    """Stop continuous recognition once the session stops or is canceled."""
    print('CLOSING on {}'.format(evt))  # fixed: '{}' placeholder was missing
    speech_recognizer.stop_continuous_recognition()
    global done
    done = True
speech_recognizer.recognized.connect(handle_final_result)
# Connect callbacks to the events fired by the speech recognizer.
# fixed: every format string below lacked its '{}' placeholder, so the
# event details were silently dropped from the log output.
speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
# Stop continuous recognition on either session-stopped or canceled events.
speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)
speech_recognizer.start_continuous_recognition()
# Busy-wait until stop_cb flips 'done'.
while not done:
    time.sleep(.5)
print("Printing all results:")
print(all_results)
# Word dicts collected by handle_final_result; input for the subtitle builder.
speech_to_text_response = words
def convertduration(t):
    """Convert a tick count in 100-nanosecond units (the unit Azure uses for
    'Offset' and 'Duration') into a ``(whole_seconds, microseconds)`` tuple
    suitable for ``datetime.timedelta(0, seconds, microseconds)``.

    fixed: the original returned the *millisecond* remainder, which callers
    then passed to timedelta as microseconds — timestamps were off by 1000x.
    """
    milliseconds = t / 10000  # 100-ns ticks -> milliseconds
    return int(milliseconds // 1000), int((milliseconds % 1000) * 1000)
##-- Code to Create Subtitle --#
# Group recognized words into captions of roughly bin_size seconds each.
bin_size = 3.0  # caption window in seconds (fixed: 'bin' shadowed the builtin)
duration = 0
transcriptions = []
transcript = ""
index, prev = 0, 0
wordstartsec, wordstartmicrosec = 0, 0
for word_info in speech_to_text_response:
    # Forms the sentence until the bin size condition is met.
    transcript = transcript + " " + word_info["Word"]
    # Checks whether the elapsed duration is less than the bin size.
    if int(duration / 10000000) < bin_size:
        # Still inside the window: remember this word's start and accumulate
        # elapsed time from the running Offset deltas (100-ns ticks).
        wordstartsec, wordstartmicrosec = convertduration(word_info["Offset"])
        duration = duration + word_info["Offset"] - prev
        prev = word_info["Offset"]
    else:
        # Window full: emit the caption and start a new one.
        index = index + 1
        transcriptions.append(
            srt.Subtitle(index,
                         datetime.timedelta(0, wordstartsec, wordstartmicrosec),
                         datetime.timedelta(0, wordstartsec + bin_size, 0),
                         transcript))
        duration = 0
        transcript = ""
# Flush the trailing partial caption.
# fixed: the original reused 'index', giving the last two subtitles the same
# number; give the final caption its own index.
transcriptions.append(
    srt.Subtitle(index + 1,
                 datetime.timedelta(0, wordstartsec, wordstartmicrosec),
                 datetime.timedelta(0, wordstartsec + bin_size, 0),
                 transcript))
subtitles = srt.compose(transcriptions)
with open("subtitle.srt", "w") as f:
    f.write(subtitles)
附上输出供您参考:
Output
希望这会有所帮助:)
【讨论】:
现在这是一个很好的答案!我终于可以看到这与 Google Cloud 版本之间的相似之处了。这也让我意识到我必须磨练我的 JSON 知识(我不知道为什么,明明是最简单的事情之一,我却遇到了麻烦)。非常感谢沙迪亚!
社招[微软Microsoft] 北京苏州上海深圳 - 最近更新
必看!如何计划您的Microsoft Teams+Azure之旅