import json
import os
import subprocess

import numpy as np
import whisper
from moviepy.editor import TextClip, CompositeVideoClip, ColorClip



# --- Step 1: extract audio, transcribe with Whisper, dump word timings ---

# Input video and the MP3 file its audio track is extracted to.
videofilename = "/Users/jitendersingh/Downloads/eleven.mp4"
audiofilename = "/Users/jitendersingh/Downloads/eleven/eleven.mp3"

# Extract the audio with ffmpeg. An argument list (shell=False) avoids
# shell-quoting problems with the paths; check=True aborts the script if
# ffmpeg fails instead of transcribing a missing/stale file.
subprocess.run(
    ["ffmpeg", "-i", videofilename, "-vn",
     "-acodec", "libmp3lame", "-q:a", "4", audiofilename],
    check=True,
)

# Transcription might take some time (approx. 3-5 min depending on audio length).
model = whisper.load_model("medium")
result = model.transcribe(audiofilename, word_timestamps=True)
print(result)

print(result['text'])

for segment in result['segments']:
    print(segment)

# Flatten the per-segment word timings into one list of
# {'word', 'start', 'end'} dicts (times in seconds).
wordlevel_info = [
    {'word': word['word'].strip(), 'start': word['start'], 'end': word['end']}
    for segment in result['segments']
    for word in segment['words']
]

# Save the word-level timings as JSON. ensure_ascii=False keeps
# non-Latin scripts (e.g. Hindi) human-readable in the file.
output_json_file = '/Users/jitendersingh/Downloads/eleven/data.json'
with open(output_json_file, 'w', encoding='utf-8') as f:
    json.dump(wordlevel_info, f, indent=4, ensure_ascii=False)

# Reload and print to confirm the round trip preserved the text.
with open(output_json_file, 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)

for word_info in loaded_data:
    print("Word:", word_info['word'])
    print("Start:", word_info['start'])
    print("End:", word_info['end'])
    print()

# Working copy consumed by the subtitle-splitting step below.
with open('/Users/jitendersingh/Downloads/eleven/data.json', 'r', encoding='utf-8') as f:
    wordlevel_info_modified = json.load(f)


def split_text_into_lines(data):
    """Group word-level timings into subtitle lines.

    A line is closed when its accumulated spoken duration exceeds
    MaxDuration or its text exceeds MaxChars (the word that tipped it
    over stays on that line), or when a silence longer than MaxGap
    precedes a word — in which case that word STARTS a new line.

    Parameters
    ----------
    data : list of dict
        Word entries with 'word', 'start', 'end' keys (seconds),
        ordered by time.

    Returns
    -------
    list of dict
        Each with 'word' (space-joined text), 'start'/'end' of the
        line, and 'textcontents' (the word dicts forming the line).
    """
    MaxChars = 40       # max characters per subtitle line
    MaxDuration = 3.0   # max accumulated spoken duration in seconds
    MaxGap = 1.5        # split if nothing is spoken for this many seconds

    subtitles = []
    line = []
    line_duration = 0.0

    def flush():
        # Close the current line (if non-empty) and reset the accumulators.
        nonlocal line, line_duration
        if line:
            subtitles.append({
                "word": " ".join(item["word"] for item in line),
                "start": line[0]["start"],
                "end": line[-1]["end"],
                "textcontents": line,
            })
        line = []
        line_duration = 0.0

    for idx, word_data in enumerate(data):
        # A long silence means the upcoming word belongs to a NEW line,
        # so flush the current line BEFORE appending it. (Flushing after
        # appending would glue the post-silence word onto the previous
        # line, defeating the purpose of the gap check.)
        if idx > 0 and word_data['start'] - data[idx - 1]['end'] > MaxGap:
            flush()

        line.append(word_data)
        line_duration += word_data['end'] - word_data['start']
        line_text = " ".join(item["word"] for item in line)

        # Close the line once it has grown too long in time or characters.
        if line_duration > MaxDuration or len(line_text) > MaxChars:
            flush()

    # Emit whatever is left after the last word.
    flush()
    return subtitles

# Group the word timings into subtitle lines and show the result.
linelevel_subtitles = split_text_into_lines(wordlevel_info_modified)
print(linelevel_subtitles)

for line in linelevel_subtitles:
    # ensure_ascii=False keeps non-Latin text readable; the original
    # .encode('utf-8').decode('utf-8') round trip was a no-op and is gone.
    print(json.dumps(line, ensure_ascii=False, indent=4))



def create_caption(textJSON, framesize, font="/Users/jitendersingh/Library/Fonts/Baloo-Regular.ttf", fontsize=80, color='white', bgcolor='blue'):
    """Build the MoviePy TextClips that render one subtitle line.

    Lays the line's words out left-to-right inside the frame (10%
    horizontal / 20% vertical margins), wrapping to a new row when a
    word would overflow, and adds a background-colored "highlight" clip
    over each word for that word's own time span (karaoke effect).

    Parameters
    ----------
    textJSON : dict
        One line from split_text_into_lines: 'start', 'end', and
        'textcontents' (per-word timing dicts).
    framesize : tuple
        (width, height) of the target frame in pixels.
    font, fontsize, color :
        Forwarded to TextClip for the word text.
    bgcolor : str
        Background color of the per-word highlight clip.

    Returns
    -------
    list
        Positioned TextClip objects (words, spacers, highlights).
    """
    full_duration = textJSON['end'] - textJSON['start']

    word_clips = []
    xy_textclips_positions = []

    frame_width, frame_height = framesize
    x_buffer = frame_width * 1 / 10   # 10% margin on the left
    y_buffer = frame_height * 1 / 5   # 20% margin at the top

    x_pos = 0
    y_pos = 0

    for wordJSON in textJSON['textcontents']:
        duration = wordJSON['end'] - wordJSON['start']
        # Base word clip, visible for the whole line; the space clip is
        # rendered too and doubles as the measured width of one space.
        word_clip = TextClip(wordJSON['word'], font=font, fontsize=fontsize,
                             color=color).set_start(textJSON['start']).set_duration(full_duration)
        word_clip_space = TextClip(" ", font=font, fontsize=fontsize,
                                   color=color).set_start(textJSON['start']).set_duration(full_duration)
        word_width, word_height = word_clip.size
        space_width, _ = word_clip_space.size

        # Wrap to the next row if this word would overflow the usable
        # width (40 px extra vertical spacing between rows).
        if x_pos + word_width + space_width > frame_width - 2 * x_buffer:
            x_pos = 0
            y_pos = y_pos + word_height + 40

        # Record where this word sits so its highlight clip can be
        # placed at exactly the same spot later.
        xy_textclips_positions.append({
            "x_pos": x_pos + x_buffer,
            "y_pos": y_pos + y_buffer,
            "width": word_width,
            "height": word_height,
            "word": wordJSON['word'],
            "start": wordJSON['start'],
            "end": wordJSON['end'],
            "duration": duration,
        })

        word_clip = word_clip.set_position((x_pos + x_buffer, y_pos + y_buffer))
        word_clip_space = word_clip_space.set_position((x_pos + word_width + x_buffer, y_pos + y_buffer))
        x_pos = x_pos + word_width + space_width

        word_clips.append(word_clip)
        word_clips.append(word_clip_space)

    # Highlight pass: the same word drawn over a colored background,
    # shown only while that word is being spoken.
    for highlight_word in xy_textclips_positions:
        word_clip_highlight = TextClip(
            highlight_word['word'], font=font, fontsize=fontsize,
            color=color, bg_color=bgcolor,
        ).set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        word_clip_highlight = word_clip_highlight.set_position(
            (highlight_word['x_pos'], highlight_word['y_pos']))
        word_clips.append(word_clip_highlight)

    return word_clips

from moviepy.editor import TextClip, CompositeVideoClip, concatenate_videoclips, VideoFileClip, ColorClip

# --- Compose and render the final captioned video ---
frame_size = (1080, 1080)

# Flatten every subtitle line's clips into one list for compositing.
all_linelevel_splits = []
for line in linelevel_subtitles:
    all_linelevel_splits.extend(create_caption(line, frame_size))

# The source video supplies both the duration and the audio track.
input_video = VideoFileClip(videofilename)
input_video_duration = input_video.duration

# Solid black background the captions are rendered onto.
background_clip = ColorClip(size=frame_size, color=(0, 0, 0)).set_duration(input_video_duration)

# If you want to overlay this on the original video uncomment this and also change frame_size, font size and color accordingly.
# final_video = CompositeVideoClip([input_video] + all_linelevel_splits)
final_video = CompositeVideoClip([background_clip] + all_linelevel_splits)

# Carry the source audio over to the rendered clip.
final_video = final_video.set_audio(input_video.audio)

# Render to disk with the audio included.
final_video.write_videofile("/Users/jitendersingh/Downloads/eleven/output.mp4", fps=24, codec="libx264", audio_codec="aac")



# NOTE(review): the trailing "0 comments: / Post a Comment / Top" lines were
# HTML-extraction residue from the blog page this script was copied from,
# not part of the script; they have been commented out so the file parses.