Skip to content

Commit 414bcb0

Browse files
authored
Merge pull request #264 from harry0703/dev
Support Azure's new speech voices and fix the bug where clips were not closed
2 parents b9b9bea + d4eb7bc commit 414bcb0

File tree

11 files changed

+177
-25
lines changed

11 files changed

+177
-25
lines changed

app/config/config.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,16 @@ def save_config():
3636
_cfg["app"] = app
3737
_cfg["whisper"] = whisper
3838
_cfg["pexels"] = pexels
39+
_cfg["azure"] = azure
40+
_cfg["ui"] = ui
3941
f.write(toml.dumps(_cfg))
4042

4143

4244
_cfg = load_config()
4345
app = _cfg.get("app", {})
4446
whisper = _cfg.get("whisper", {})
4547
pexels = _cfg.get("pexels", {})
48+
azure = _cfg.get("azure", {})
4649
ui = _cfg.get("ui", {})
4750

4851
hostname = socket.gethostname()
@@ -53,7 +56,7 @@ def save_config():
5356
project_name = _cfg.get("project_name", "MoneyPrinterTurbo")
5457
project_description = _cfg.get("project_description",
5558
"<a href='https://github.com/harry0703/MoneyPrinterTurbo'>https://github.com/harry0703/MoneyPrinterTurbo</a>")
56-
project_version = _cfg.get("project_version", "1.1.1")
59+
project_version = _cfg.get("project_version", "1.1.2")
5760
reload_debug = False
5861

5962
imagemagick_path = app.get("imagemagick_path", "")

app/controllers/v1/video.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def delete_video(request: Request, task_id: str = Path(..., description="Task ID
9191

9292
sm.state.delete_task(task_id)
9393
logger.success(f"video deleted: {utils.to_json(task)}")
94-
return utils.get_response(200, task)
94+
return utils.get_response(200)
9595

9696
raise HttpException(task_id=task_id, status_code=404, message=f"{request_id}: task not found")
9797

@@ -190,4 +190,5 @@ async def download_video(_: Request, file_path: str):
190190
headers = {
191191
"Content-Disposition": f"attachment; filename={filename}{extension}"
192192
}
193-
return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}", media_type=f'video/{extension[1:]}')
193+
return FileResponse(path=video_path, headers=headers, filename=f"{filename}{extension}",
194+
media_type=f'video/{extension[1:]}')

app/services/video.py

+5-4
Original file line numberDiff line numberDiff line change
@@ -100,17 +100,18 @@ def combine_videos(combined_video_path: str,
100100
clips.append(clip)
101101
video_duration += clip.duration
102102

103-
final_clip = concatenate_videoclips(clips)
104-
final_clip = final_clip.set_fps(30)
103+
video_clip = concatenate_videoclips(clips)
104+
video_clip = video_clip.set_fps(30)
105105
logger.info(f"writing")
106106
# https://github.com/harry0703/MoneyPrinterTurbo/issues/111#issuecomment-2032354030
107-
final_clip.write_videofile(filename=combined_video_path,
107+
video_clip.write_videofile(filename=combined_video_path,
108108
threads=threads,
109109
logger=None,
110110
temp_audiofile_path=output_dir,
111111
audio_codec="aac",
112112
fps=30,
113113
)
114+
video_clip.close()
114115
logger.success(f"completed")
115116
return combined_video_path
116117

@@ -263,7 +264,7 @@ def create_text_clip(subtitle_item):
263264
logger=None,
264265
fps=30,
265266
)
266-
267+
video_clip.close()
267268
logger.success(f"completed")
268269

269270

app/services/voice.py

+129-4
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
import asyncio
22
import os
33
import re
4+
from datetime import datetime
45
from xml.sax.saxutils import unescape
56
from edge_tts.submaker import mktimestamp
67
from loguru import logger
78
from edge_tts import submaker, SubMaker
89
import edge_tts
910
from moviepy.video.tools import subtitles
1011

12+
from app.config import config
1113
from app.utils import utils
1214

1315

14-
def get_all_voices(filter_locals=None) -> list[str]:
16+
def get_all_azure_voices(filter_locals=None) -> list[str]:
1517
if filter_locals is None:
1618
filter_locals = ["zh-CN", "en-US", "zh-HK", "zh-TW"]
1719
voices_str = """
@@ -956,6 +958,34 @@ def get_all_voices(filter_locals=None) -> list[str]:
956958
957959
Name: zu-ZA-ThembaNeural
958960
Gender: Male
961+
962+
963+
Name: en-US-AvaMultilingualNeural-V2
964+
Gender: Female
965+
966+
Name: en-US-AndrewMultilingualNeural-V2
967+
Gender: Male
968+
969+
Name: en-US-EmmaMultilingualNeural-V2
970+
Gender: Female
971+
972+
Name: en-US-BrianMultilingualNeural-V2
973+
Gender: Male
974+
975+
Name: de-DE-FlorianMultilingualNeural-V2
976+
Gender: Male
977+
978+
Name: de-DE-SeraphinaMultilingualNeural-V2
979+
Gender: Female
980+
981+
Name: fr-FR-RemyMultilingualNeural-V2
982+
Gender: Male
983+
984+
Name: fr-FR-VivienneMultilingualNeural-V2
985+
Gender: Female
986+
987+
Name: zh-CN-XiaoxiaoMultilingualNeural-V2
988+
Gender: Female
959989
""".strip()
960990
voices = []
961991
name = ''
@@ -986,11 +1016,26 @@ def get_all_voices(filter_locals=None) -> list[str]:
9861016
def parse_voice_name(name: str):
    """Return the voice name with its gender marker removed.

    Examples of accepted inputs:
        zh-CN-XiaoyiNeural-Female
        zh-CN-YunxiNeural-Male
        zh-CN-XiaoxiaoMultilingualNeural-V2-Female

    Args:
        name: Voice name, optionally carrying a "-Female" or "-Male" marker.

    Returns:
        The name without the gender marker and without surrounding whitespace.
    """
    name = name.replace("-Female", "").replace("-Male", "").strip()
    return name


def is_azure_v2_voice(voice_name: str) -> str:
    """Return the bare voice name if ``voice_name`` is a "-V2" voice.

    Despite the ``is_`` prefix this returns a string, not a bool, so callers
    can use the result both as a truth value and as the cleaned voice name.

    Args:
        voice_name: Voice name, optionally carrying a gender marker,
            e.g. "zh-CN-XiaoxiaoMultilingualNeural-V2-Female".

    Returns:
        The voice name without the gender marker and without the trailing
        "-V2", or an empty string when the name does not end in "-V2".
    """
    v2_suffix = "-V2"
    voice_name = parse_voice_name(voice_name)
    if voice_name.endswith(v2_suffix):
        # Drop only the trailing marker that was just matched; a "-V2" that
        # happens to appear earlier in the name is left untouched.
        return voice_name[:-len(v2_suffix)].strip()
    return ""
1030+
1031+
9931032
def tts(text: str, voice_name: str, voice_file: str) -> "SubMaker | None":
    """Synthesize ``text`` with the backend that matches ``voice_name``.

    Voice names recognised by ``is_azure_v2_voice`` (those ending in "-V2",
    with an optional gender marker) are handled by ``azure_tts_v2``; every
    other name is handled by ``azure_tts_v1``.

    Args:
        text: Text to speak.
        voice_name: Voice identifier, e.g. "zh-CN-XiaoyiNeural-Female" or
            "zh-CN-XiaoxiaoMultilingualNeural-V2-Female".
        voice_file: Path of the audio file to produce.

    Returns:
        Whatever the selected backend returns: a ``SubMaker`` on success or
        ``None`` on failure.
    """
    if is_azure_v2_voice(voice_name):
        return azure_tts_v2(text, voice_name, voice_file)
    return azure_tts_v1(text, voice_name, voice_file)
1036+
1037+
1038+
def azure_tts_v1(text: str, voice_name: str, voice_file: str) -> [SubMaker, None]:
9941039
text = text.strip()
9951040
for i in range(3):
9961041
try:
@@ -1019,6 +1064,80 @@ async def _do() -> SubMaker:
10191064
return None
10201065

10211066

1067+
def azure_tts_v2(text: str, voice_name: str, voice_file: str) -> "SubMaker | None":
    """Synthesize ``text`` into ``voice_file`` with the Azure Speech SDK.

    The speech key and region are read from ``config.azure`` ("speech_key"
    and "speech_region"). Synthesis is attempted up to three times.

    Args:
        text: Text to speak; surrounding whitespace is stripped.
        voice_name: Voice identifier ending in "-V2", with an optional gender
            marker, e.g. "zh-CN-XiaoxiaoMultilingualNeural-V2-Female".
        voice_file: Path passed to the SDK as the audio output file.

    Returns:
        A ``SubMaker`` whose ``subs``/``offset`` lists were filled from the
        SDK's word-boundary events, or ``None`` if every attempt failed.

    Raises:
        ValueError: If ``voice_name`` is not recognised as a "-V2" voice.
    """
    from datetime import timedelta

    # Keep the caller's value so the error message shows what was rejected.
    requested_voice_name = voice_name
    voice_name = is_azure_v2_voice(voice_name)
    if not voice_name:
        logger.error(f"invalid voice name: {requested_voice_name}")
        raise ValueError(f"invalid voice name: {requested_voice_name}")
    text = text.strip()

    def _format_duration_to_offset(duration) -> int:
        """Convert a duration to an offset: whole milliseconds * 10000."""
        if isinstance(duration, timedelta):
            return (duration // timedelta(milliseconds=1)) * 10000

        if isinstance(duration, str):
            # str(timedelta) drops the fractional part for whole seconds
            # ("0:00:01"), so pick the format that matches the input.
            time_format = "%H:%M:%S.%f" if "." in duration else "%H:%M:%S"
            time_obj = datetime.strptime(duration, time_format)
            milliseconds = (time_obj.hour * 3600000) + (time_obj.minute * 60000) + (time_obj.second * 1000) + (
                    time_obj.microsecond // 1000)
            return milliseconds * 10000

        if isinstance(duration, int):
            # presumably already expressed in the same unit -- passed through
            return duration

        return 0

    for i in range(3):
        try:
            logger.info(f"start, voice name: {voice_name}, try: {i + 1}")

            # Imported inside the try so a missing SDK is logged like any
            # other failure instead of breaking module import.
            import azure.cognitiveservices.speech as speechsdk

            sub_maker = SubMaker()

            def speech_synthesizer_word_boundary_cb(evt):
                # evt is the SDK's word-boundary event; only audio_offset,
                # duration and text are read here.
                duration = _format_duration_to_offset(evt.duration)
                offset = _format_duration_to_offset(evt.audio_offset)
                sub_maker.subs.append(evt.text)
                sub_maker.offset.append((offset, offset + duration))

            # Creates an instance of a speech config with specified subscription key and service region.
            speech_key = config.azure.get("speech_key", "")
            service_region = config.azure.get("speech_region", "")
            audio_config = speechsdk.audio.AudioOutputConfig(filename=voice_file, use_default_speaker=True)
            speech_config = speechsdk.SpeechConfig(subscription=speech_key,
                                                   region=service_region)
            speech_config.speech_synthesis_voice_name = voice_name
            speech_config.set_property(property_id=speechsdk.PropertyId.SpeechServiceResponse_RequestWordBoundary,
                                       value='true')

            speech_config.set_speech_synthesis_output_format(
                speechsdk.SpeechSynthesisOutputFormat.Audio48Khz192KBitRateMonoMp3)
            speech_synthesizer = speechsdk.SpeechSynthesizer(audio_config=audio_config,
                                                             speech_config=speech_config)
            speech_synthesizer.synthesis_word_boundary.connect(speech_synthesizer_word_boundary_cb)

            result = speech_synthesizer.speak_text_async(text).get()
            if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
                logger.success(f"azure v2 speech synthesis succeeded: {voice_file}")
                return sub_maker

            if result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                logger.error(f"azure v2 speech synthesis canceled: {cancellation_details.reason}")
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    logger.error(f"azure v2 speech synthesis error: {cancellation_details.error_details}")
            # Reaching this point means the attempt did not succeed.
            logger.warning(f"azure v2 speech synthesis did not complete, try: {i + 1}")
        except Exception as e:
            logger.error(f"failed, error: {str(e)}")
    return None
1139+
1140+
10221141
def _format_text(text: str) -> str:
10231142
# text = text.replace("\n", " ")
10241143
text = text.replace("[", " ")
@@ -1131,15 +1250,20 @@ def get_audio_duration(sub_maker: submaker.SubMaker):
11311250

11321251

11331252
if __name__ == "__main__":
1134-
voices = get_all_voices()
1135-
print(voices)
1253+
voice_name = "zh-CN-XiaoxiaoMultilingualNeural-V2-Female"
1254+
voice_name = parse_voice_name(voice_name)
1255+
voice_name = is_azure_v2_voice(voice_name)
1256+
print(voice_name)
1257+
1258+
voices = get_all_azure_voices()
11361259
print(len(voices))
11371260

11381261

11391262
async def _do():
11401263
temp_dir = utils.storage_dir("temp")
11411264

11421265
voice_names = [
1266+
"zh-CN-XiaoxiaoMultilingualNeural",
11431267
# 女性
11441268
"zh-CN-XiaoxiaoNeural",
11451269
"zh-CN-XiaoyiNeural",
@@ -1174,6 +1298,7 @@ async def _do():
11741298
业绩解读
11751299
利润方面,2023全年贵州茅台,>归母净利润增速为19%,其中营业收入正贡献18%,营业成本正贡献百分之一,管理费用正贡献百分之一点四。(注:归母净利润增速值=营业收入增速+各科目贡献,展示贡献/拖累的前四名科目,且要求贡献值/净利润增速>15%)
11761300
"""
1301+
text = "静夜思是唐代诗人李白创作的一首五言古诗。这首诗描绘了诗人在寂静的夜晚,看到窗前的明月,不禁想起远方的家乡和亲人"
11771302

11781303
text = _format_text(text)
11791304
lines = utils.split_string_by_punctuations(text)
@@ -1182,7 +1307,7 @@ async def _do():
11821307
for voice_name in voice_names:
11831308
voice_file = f"{temp_dir}/tts-{voice_name}.mp3"
11841309
subtitle_file = f"{temp_dir}/tts.mp3.srt"
1185-
sub_maker = tts(text=text, voice_name=voice_name, voice_file=voice_file)
1310+
sub_maker = azure_tts_v2(text=text, voice_name=voice_name, voice_file=voice_file)
11861311
create_subtitle(sub_maker=sub_maker, text=text, subtitle_file=subtitle_file)
11871312
audio_duration = get_audio_duration(sub_maker)
11881313
print(f"voice: {voice_name}, audio duration: {audio_duration}s")

app/utils/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def split_string_by_punctuations(s):
188188
else:
189189
result.append(txt.strip())
190190
txt = ""
191-
191+
result.append(txt.strip())
192192
# filter empty string
193193
result = list(filter(None, result))
194194
return result

config.example.toml

+7-1
Original file line numberDiff line numberDiff line change
@@ -161,4 +161,10 @@
161161
### Example: "http://user:pass@proxy:1234"
162162
### Doc: https://requests.readthedocs.io/en/latest/user/advanced/#proxies
163163
# http = "http://10.10.1.10:3128"
164-
# https = "http://10.10.1.10:1080"
164+
# https = "http://10.10.1.10:1080"
165+
166+
[azure]
167+
# Azure Speech API Key
168+
# Get your API key at https://portal.azure.com/#view/Microsoft_Azure_ProjectOxford/CognitiveServicesHub/~/SpeechServices
169+
speech_key=""
170+
speech_region=""

requirements.txt

+7-1
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,10 @@ g4f~=0.2.5.4
1616
dashscope~=1.15.0
1717
google.generativeai~=0.4.1
1818
python-multipart~=0.0.9
19-
redis==5.0.3
19+
redis==5.0.3
20+
# With pillow~=10.3.0, resizing a video fails with "module 'PIL.Image' has no attribute 'ANTIALIAS'".
21+
# Installing opencv-python works around that error.
22+
opencv-python
23+
# for azure speech
24+
# https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/9-more-realistic-ai-voices-for-conversations-now-generally/ba-p/4099471
25+
azure-cognitiveservices-speech~=1.37.0

0 commit comments

Comments
 (0)