import os
import whisper
import json
from moviepy.editor import TextClip, CompositeVideoClip, ColorClip
import numpy as np
# Define the input video file and output audio file
videofilename = "/Users/jitendersingh/Downloads/eleven.mp4"
audiofilename = "/Users/jitendersingh/Downloads/eleven/eleven.mp3"
# Run the ffmpeg command to extract audio
os.system(f'ffmpeg -i "{videofilename}" -vn -acodec libmp3lame -q:a 4 "{audiofilename}"')
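# Optional: a sturdier way to run the same ffmpeg command. This is a minimal
# sketch using subprocess.run, which raises an error if ffmpeg is missing or
# exits with a non-zero status instead of failing silently (-y overwrites an
# existing mp3); it is not required for the rest of the script.
# import subprocess
# subprocess.run(
#     ["ffmpeg", "-y", "-i", videofilename, "-vn", "-acodec", "libmp3lame", "-q:a", "4", audiofilename],
#     check=True,
# )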
# Transcription might take some time (roughly 3-5 minutes, depending on the audio length)
model = whisper.load_model("medium")
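# Whisper also ships smaller checkpoints ("tiny", "base", "small") that load and
# transcribe faster at some cost in accuracy; swap the name if "medium" is too
# slow on your machine, e.g.:
# model = whisper.load_model("small")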
result = model.transcribe(audiofilename,word_timestamps=True)
print(result)
print (result['text'])
for each in result['segments']:
    print(each)
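# With word_timestamps=True, each entry of result['segments'] is a dict roughly
# shaped like the example below (illustrative values only):
# {
#     'id': 0, 'start': 0.0, 'end': 2.4, 'text': ' Hello world',
#     'words': [
#         {'word': ' Hello', 'start': 0.0, 'end': 0.6, 'probability': 0.98},
#         {'word': ' world', 'start': 0.6, 'end': 1.1, 'probability': 0.97},
#     ],
#     ...
# }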
wordlevel_info = []
for each in result['segments']:
    words = each['words']
    for word in words:
        # print(word['word'], " ", word['start'], " - ", word['end'])
        wordlevel_info.append({'word': word['word'].strip(), 'start': word['start'], 'end': word['end']})
print(wordlevel_info)
# Save the wordlevel_info data as JSON
output_json_file = '/Users/jitendersingh/Downloads/eleven/data.json'
with open(output_json_file, 'w', encoding='utf-8') as f:
    json.dump(wordlevel_info, f, indent=4, ensure_ascii=False)
# Load the JSON file with proper decoding
with open(output_json_file, 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)
# Now you can access the JSON data with Hindi words in their readable form
for word_info in loaded_data:
    print("Word:", word_info['word'])
    print("Start:", word_info['start'])
    print("End:", word_info['end'])
    print()
# Reload the word-level data from data.json (this picks up any manual edits made to the file)
with open('/Users/jitendersingh/Downloads/eleven/data.json', 'r', encoding='utf-8') as f:
    wordlevel_info_modified = json.load(f)
print(wordlevel_info_modified)
def split_text_into_lines(data):
    # Maximum characters per subtitle line
    MaxChars = 40
    # Maximum line duration in seconds
    MaxDuration = 3.0
    # Start a new line if nothing is spoken (gap) for this many seconds
    MaxGap = 1.5

    subtitles = []
    line = []
    line_duration = 0

    for idx, word_data in enumerate(data):
        word = word_data["word"]
        start = word_data["start"]
        end = word_data["end"]

        line.append(word_data)
        line_duration += end - start
        temp = " ".join(item["word"] for item in line)

        # Check whether adding the new word exceeds the maximum character count or duration
        new_line_chars = len(temp)
        duration_exceeded = line_duration > MaxDuration
        chars_exceeded = new_line_chars > MaxChars

        if idx > 0:
            gap = word_data['start'] - data[idx - 1]['end']
            # print(word, start, end, gap)
            maxgap_exceeded = gap > MaxGap
        else:
            maxgap_exceeded = False

        if duration_exceeded or chars_exceeded or maxgap_exceeded:
            if line:
                subtitle_line = {
                    "word": " ".join(item["word"] for item in line),
                    "start": line[0]["start"],
                    "end": line[-1]["end"],
                    "textcontents": line
                }
                subtitles.append(subtitle_line)
                line = []
                line_duration = 0

    # Flush whatever is left over as the final line
    if line:
        subtitle_line = {
            "word": " ".join(item["word"] for item in line),
            "start": line[0]["start"],
            "end": line[-1]["end"],
            "textcontents": line
        }
        subtitles.append(subtitle_line)

    return subtitles
linelevel_subtitles = split_text_into_lines(wordlevel_info_modified)
print (linelevel_subtitles)
for line in linelevel_subtitles:
    json_str = json.dumps(line, ensure_ascii=False, indent=4)
    print(json_str)
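# Each line-level entry groups several word entries, e.g. (illustrative values only):
# {
#     "word": "Hello world this is",
#     "start": 0.0,
#     "end": 2.1,
#     "textcontents": [{"word": "Hello", "start": 0.0, "end": 0.6}, ...]
# }
# create_caption below uses "textcontents" to position and highlight each word.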
def create_caption(textJSON, framesize, font="/Users/jitendersingh/Library/Fonts/Baloo-Regular.ttf", fontsize=80, color='white', bgcolor='blue'):
    wordcount = len(textJSON['textcontents'])
    full_duration = textJSON['end'] - textJSON['start']

    word_clips = []
    xy_textclips_positions = []

    x_pos = 0
    y_pos = 0
    # max_height = 0
    frame_width = framesize[0]
    frame_height = framesize[1]
    x_buffer = frame_width * 1 / 10
    y_buffer = frame_height * 1 / 5

    space_width = 0
    space_height = 0

    for index, wordJSON in enumerate(textJSON['textcontents']):
        duration = wordJSON['end'] - wordJSON['start']
        word_clip = TextClip(wordJSON['word'], font=font, fontsize=fontsize, color=color).set_start(textJSON['start']).set_duration(full_duration)
        word_clip_space = TextClip(" ", font=font, fontsize=fontsize, color=color).set_start(textJSON['start']).set_duration(full_duration)
        word_width, word_height = word_clip.size
        space_width, space_height = word_clip_space.size

        if x_pos + word_width + space_width > frame_width - 2 * x_buffer:
            # Move to the next line
            x_pos = 0
            y_pos = y_pos + word_height + 40

            # Store info of each word_clip created
            xy_textclips_positions.append({
                "x_pos": x_pos + x_buffer,
                "y_pos": y_pos + y_buffer,
                "width": word_width,
                "height": word_height,
                "word": wordJSON['word'],
                "start": wordJSON['start'],
                "end": wordJSON['end'],
                "duration": duration
            })

            word_clip = word_clip.set_position((x_pos + x_buffer, y_pos + y_buffer))
            word_clip_space = word_clip_space.set_position((x_pos + word_width + x_buffer, y_pos + y_buffer))
            x_pos = word_width + space_width
        else:
            # Store info of each word_clip created
            xy_textclips_positions.append({
                "x_pos": x_pos + x_buffer,
                "y_pos": y_pos + y_buffer,
                "width": word_width,
                "height": word_height,
                "word": wordJSON['word'],
                "start": wordJSON['start'],
                "end": wordJSON['end'],
                "duration": duration
            })

            word_clip = word_clip.set_position((x_pos + x_buffer, y_pos + y_buffer))
            word_clip_space = word_clip_space.set_position((x_pos + word_width + x_buffer, y_pos + y_buffer))
            x_pos = x_pos + word_width + space_width

        word_clips.append(word_clip)
        word_clips.append(word_clip_space)

    # For every word, add a highlighted copy that is shown only while that word is spoken
    for highlight_word in xy_textclips_positions:
        word_clip_highlight = TextClip(highlight_word['word'], font=font, fontsize=fontsize, color=color, bg_color=bgcolor).set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        word_clip_highlight = word_clip_highlight.set_position((highlight_word['x_pos'], highlight_word['y_pos']))
        word_clips.append(word_clip_highlight)

    return word_clips
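# Optional sanity check: render just the first caption line on a black square
# before doing the full video. A minimal sketch (the preview path below is an
# arbitrary choice, not part of the original pipeline):
# first_line = linelevel_subtitles[0]
# preview_clips = create_caption(first_line, (1080, 1080))
# preview_bg = ColorClip(size=(1080, 1080), color=(0, 0, 0)).set_duration(first_line['end'])
# CompositeVideoClip([preview_bg] + preview_clips).write_videofile(
#     "/Users/jitendersingh/Downloads/eleven/preview.mp4", fps=24)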
from moviepy.editor import VideoFileClip
frame_size = (1080,1080)
all_linelevel_splits=[]
for line in linelevel_subtitles:
    out = create_caption(line, frame_size)
    all_linelevel_splits.extend(out)
# Load the input video
input_video = VideoFileClip(videofilename)
# Get the duration of the input video
input_video_duration = input_video.duration
# Create a color clip with the given frame size, color, and duration
background_clip = ColorClip(size=frame_size, color=(0, 0, 0)).set_duration(input_video_duration)
# To overlay the captions on the original video instead, uncomment the next line (and adjust frame_size, fontsize and color accordingly).
# final_video = CompositeVideoClip([input_video] + all_linelevel_splits)
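# A minimal sketch of that overlay variant, assuming a landscape source and
# MoviePy 1.x's resize/crop helpers (the captions already target a 1080x1080 frame):
# square_video = input_video.resize(height=1080)
# square_video = square_video.crop(x_center=square_video.w / 2, width=1080)
# final_video = CompositeVideoClip([square_video] + all_linelevel_splits)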
final_video = CompositeVideoClip([background_clip] + all_linelevel_splits)
# Set the audio of the final video to be the same as the input video
final_video = final_video.set_audio(input_video.audio)
# Save the final clip as a video file with the audio included
final_video.write_videofile("/Users/jitendersingh/Downloads/eleven/output.mp4", fps=24, codec="libx264", audio_codec="aac")