import asyncio
import os
import re
+ from datetime import datetime
from xml.sax.saxutils import unescape
from edge_tts.submaker import mktimestamp
from loguru import logger
from edge_tts import submaker, SubMaker
import edge_tts
from moviepy.video.tools import subtitles

+ from app.config import config
from app.utils import utils


- def get_all_voices(filter_locals=None) -> list[str]:
+ def get_all_azure_voices(filter_locals=None) -> list[str]:
    if filter_locals is None:
        filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW"]
    voices_str = """
@@ -956,6 +958,34 @@ def get_all_voices(filter_locals=None) -> list[str]:
Name: zu-ZA-ThembaNeural
Gender: Male
+
+
+ Name: en-US-AvaMultilingualNeural-V2
+ Gender: Female
+
+ Name: en-US-AndrewMultilingualNeural-V2
+ Gender: Male
+
+ Name: en-US-EmmaMultilingualNeural-V2
+ Gender: Female
+
+ Name: en-US-BrianMultilingualNeural-V2
+ Gender: Male
+
+ Name: de-DE-FlorianMultilingualNeural-V2
+ Gender: Male
+
+ Name: de-DE-SeraphinaMultilingualNeural-V2
+ Gender: Female
+
+ Name: fr-FR-RemyMultilingualNeural-V2
+ Gender: Male
+
+ Name: fr-FR-VivienneMultilingualNeural-V2
+ Gender: Female
+
+ Name: zh-CN-XiaoxiaoMultilingualNeural-V2
+ Gender: Female
    """.strip()
    voices = []
    name = ''
@@ -986,11 +1016,26 @@ def get_all_voices(filter_locals=None) -> list[str]:
def parse_voice_name(name: str):
    # zh-CN-XiaoyiNeural-Female
    # zh-CN-YunxiNeural-Male
+     # zh-CN-XiaoxiaoMultilingualNeural-V2-Female
    name = name.replace("-Female", "").replace("-Male", "").strip()
    return name


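+ # Voices whose display name ends in "-V2" (the "*MultilingualNeural-V2" entries above) are
+ # synthesized with the Azure Speech SDK; all other voices keep using edge-tts.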
+ def is_azure_v2_voice(voice_name: str):
+     voice_name = parse_voice_name(voice_name)
+     print(voice_name)
+     if voice_name.endswith("-V2"):
+         return voice_name.replace("-V2", "").strip()
+     return ""
+
+
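+ # Dispatch on voice type: "-V2" voices go to the Azure Speech SDK path, everything else to edge-tts.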
def tts(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+     if is_azure_v2_voice(voice_name):
+         return azure_tts_v2(text, voice_name, voice_file)
+     return azure_tts_v1(text, voice_name, voice_file)
+
+
+ def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
    text = text.strip()
    for i in range(3):
        try:
@@ -1019,6 +1064,80 @@ async def _do() -> SubMaker:
    return None


+ def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
+     parsed_name = is_azure_v2_voice(voice_name)
+     if not parsed_name:
+         logger.error(f"invalid voice name: {voice_name}")
+         raise ValueError(f"invalid voice name: {voice_name}")
+     voice_name = parsed_name
+     text = text.strip()
+
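+     # The Azure SDK reports word-boundary audio_offset in 100-nanosecond ticks and duration as a
+     # timedelta (stringified as "H:MM:SS.ffffff"); SubMaker stores offsets in the same 100-ns ticks,
+     # so string durations are converted via milliseconds * 10000.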
+     def _format_duration_to_offset(duration) -> int:
+         if isinstance(duration, str):
+             time_obj = datetime.strptime(duration, "%H:%M:%S.%f")
+             milliseconds = (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (
+                     time_obj.microsecond // 1000)
+             return milliseconds * 10000
+
+         if isinstance(duration, int):
+             return duration
+
+         return 0
+
+     for i in range(3):
+         try:
+             logger.info(f"start, voice name: {voice_name}, try: {i + 1}")
+
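+             # Import lazily so azure-cognitiveservices-speech stays an optional dependency;
+             # it is only needed when a V2 voice is actually requested.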
+             import azure.cognitiveservices.speech as speechsdk
+
+             sub_maker = SubMaker()
+
+             def speech_synthesizer_word_boundary_cb(evt: speechsdk.SessionEventArgs):
+                 # print('WordBoundary event:')
+                 # print('\tBoundaryType: {}'.format(evt.boundary_type))
+                 # print('\tAudioOffset: {}ms'.format((evt.audio_offset + 5000)))
+                 # print('\tDuration: {}'.format(evt.duration))
+                 # print('\tText: {}'.format(evt.text))
+                 # print('\tTextOffset: {}'.format(evt.text_offset))
+                 # print('\tWordLength: {}'.format(evt.word_length))
+
+                 duration = _format_duration_to_offset(str(evt.duration))
+                 offset = _format_duration_to_offset(evt.audio_offset)
+                 sub_maker.subs.append(evt.text)
+                 sub_maker.offset.append((offset, offset + duration))
+
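+             # speech_key / speech_region are read from the app's Azure config (e.g. an "azure"
+             # section in config.toml); empty values will make the synthesis request fail.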
+             # Creates an instance of a speech config with specified subscription key and service region.
+             speech_key = config.azure.get("speech_key", "")
+             service_region = config.azure.get("speech_region", "")
+             audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
+             speech_config = speechsdk.SpeechConfig(subscription=speech_key,
+                                                    region=service_region)
+             speech_config.speech_synthesis_voice_name = voice_name
+             # speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestSentenceBoundary,
+             #                            value='true')
+             speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
+                                        value='true')
+
+             speech_config.set_speech_synthesis_output_format(
+                 speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
+             speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config,
+                                                              speech_config=speech_config)
+             speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)
+
+             result = speech_synthesizer.speak_text_async(text).get()
+             if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+                 logger.success(f"azure v2 speech synthesis succeeded: {voice_file}")
+                 return sub_maker
+             elif result.reason == speechsdk.ResultReason.Canceled:
+                 cancellation_details = result.cancellation_details
+                 logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
+                 if cancellation_details.reason == speechsdk.CancellationReason.Error:
+                     logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
+             logger.info(f"completed, output file: {voice_file}")
+         except Exception as e:
+             logger.error(f"failed, error: {str(e)}")
+     return None
+
+

def _format_text(text: str) -> str:
    # text = text.replace("\n", " ")
    text = text.replace("[", " ")
@@ -1131,15 +1250,20 @@ def get_audio_duration(sub_maker: submaker.SubMaker):


if __name__ == "__main__":
-     voices = get_all_voices()
-     print(voices)
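+     # Sanity check: a V2 display name should resolve to the bare SDK voice name
+     # (here "zh-CN-XiaoxiaoMultilingualNeural").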
+     voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
+     voice_name = parse_voice_name(voice_name)
+     voice_name = is_azure_v2_voice(voice_name)
+     print(voice_name)
+
+     voices = get_all_azure_voices()
    print(len(voices))


    async def _do():
        temp_dir = utils.storage_dir("temp")

        voice_names = [
+             "zh-CN-XiaoxiaoMultilingualNeural",
            # Female voices
            "zh-CN-XiaoxiaoNeural",
            "zh-CN-XiaoyiNeural",
@@ -1174,6 +1298,7 @@ async def _do():
业绩解读
利润方面,2023全年贵州茅台,归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%)
        """
+         text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人"

        text = _format_text(text)
        lines = utils.split_string_by_punctuations(text)
@@ -1182,7 +1307,7 @@ async def _do():
        for voice_name in voice_names:
            voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
            subtitle_file = f"{temp_dir}/tts.mp3.srt"
-             sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
+             sub_maker = azure_tts_v2(text=text, voice_name=voice_name, voice_file=voice_file)
            create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
            audio_duration = get_audio_duration(sub_maker)
            print(f"voice: {voice_name}, audio duration: {audio_duration}s")