百度的语音转文字API只支持最长60s的语音。如果要识别更长的语音,需要先把音频切分成不超过60s的片段,逐段识别后再把结果按顺序合并。接口对音频的采样率和声道数也有要求,下面的代码在识别前会把音频统一转换为16000Hz采样率、单声道。以下是一个成功案例。
源代码
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
| from aip import AipSpeech from pydub import AudioSegment import os
def audio_to_text(audio_path):
    """Transcribe a WAV file of arbitrary length with Baidu's speech API.

    The audio is converted to 16 kHz mono, cut into consecutive chunks of
    at most 60 seconds, and each chunk is sent to ``AipSpeech.asr``. The
    recognized text of every successful chunk is printed and concatenated
    in order.

    Args:
        audio_path: Path to the input WAV file.

    Returns:
        The concatenated transcription of all chunks that were recognized
        successfully. Chunks for which the API reports an error contribute
        nothing; their error message is printed instead. An empty input
        yields an empty string without calling the API.
    """
    # Local import so this function stays self-contained; only the
    # in-memory WAV buffer below needs it.
    import io

    # Fill in the credentials of your Baidu AI application.
    APP_ID = ''
    API_KEY = ''
    SECRET_KEY = ''

    sample_rate = 16000
    max_len = 60  # longest clip, in seconds, accepted per request
    chunk_ms = max_len * 1000

    client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)

    audio = AudioSegment.from_wav(audio_path)
    audio = audio.set_frame_rate(sample_rate).set_channels(1)

    # len() of an AudioSegment is its duration in milliseconds. Stepping
    # through it with range() never produces an empty trailing chunk when
    # the duration is an exact multiple of the chunk size.
    chunks = [
        audio[start:start + chunk_ms]
        for start in range(0, len(audio), chunk_ms)
    ]

    texts = []
    for chunk in chunks:
        # Encode the chunk in memory rather than through a fixed
        # 'temp.wav', which could clobber an existing file and would be
        # left behind if the request raised.
        buffer = io.BytesIO()
        chunk.export(buffer, format='wav')

        # NOTE(review): dev_pid 1537 presumably selects Baidu's Mandarin
        # model - confirm against the Baidu documentation.
        result = client.asr(
            buffer.getvalue(), 'wav', sample_rate, {'dev_pid': 1537},
        )

        if result.get('err_no') == 0:
            text = result['result'][0]
            print(text)
            texts.append(text)
        else:
            # Best effort: report the failure and continue with the
            # remaining chunks.
            print(result.get('err_msg'))

    return ''.join(texts)
|