Tipoalgo: Gerador de Legendas

- Nesta versão foi acrescentada uma interface gráfica ao script;

- A parte de tradução das legendas foi retirada;

- O reconhecimento da fala suporta 3 idiomas: Inglês, Português e Espanhol.

import os
import subprocess
import vosk
import pysrt
import json
import sys
import tkinter as tk
from tkinter import filedialog
import threading
from moviepy.editor import VideoFileClip

def recognize_speech(file_path, language, text_widget, subtitles):
    """Transcribe the audio track of a video into subtitle items using Vosk.

    ffmpeg decodes the file to 16 kHz mono 16-bit PCM on a pipe. The audio is
    fed to a Vosk recognizer in fixed-size chunks, and every chunk that yields
    words becomes one subtitle, which is also echoed to ``text_widget``.

    Args:
        file_path: Path of the video file to transcribe.
        language: ``"en"``, ``"pt"`` or ``"es"``; selects the Vosk model
            directory (a relative path, so it is resolved against the
            current working directory).
        text_widget: Tk text widget that receives each recognized line.
        subtitles: List-like container (e.g. ``pysrt.SubRipFile``) to which
            the generated ``pysrt.SubRipItem`` objects are appended in place.

    Returns:
        The same ``subtitles`` object that was passed in.

    Raises:
        ValueError: If ``language`` is not one of the supported codes.
        FileNotFoundError: If ``file_path`` is not an existing file.
    """
    model_paths = {
        "en": "vosk-model-small-en-us-0.15",
        "pt": "vosk-model-pt-fb-v0.1.1-20220516_2113",
        "es": "vosk-model-small-es-0.42",
    }
    if language not in model_paths:
        raise ValueError("Idioma não suportado.")

    if not os.path.isfile(file_path):
        raise FileNotFoundError("O arquivo de vídeo não existe.")

    # Sample rate requested from ffmpeg and announced to the recognizer.
    sample_rate = 16000
    # 16-bit mono PCM is 2 bytes per sample, so this is 4.5 s of audio.
    chunk_size = sample_rate * 9

    model = vosk.Model(model_paths[language])
    rec = vosk.KaldiRecognizer(model, sample_rate)
    rec.SetWords(True)  # per-word timestamps are needed for subtitle timing

    # Open the clip once with moviepy as an early sanity check on the file,
    # then close it right away so its reader is not left open.
    VideoFileClip(file_path).close()

    # Argument list + no shell: paths containing spaces or shell
    # metacharacters are passed to ffmpeg verbatim. Raw PCM ("s16le") is
    # emitted instead of a WAV container so no header bytes reach Vosk.
    ffmpeg_command = [
        "ffmpeg", "-y",
        "-i", file_path,
        "-vn",
        "-acodec", "pcm_s16le",
        "-ar", str(sample_rate),
        "-ac", "1",
        "-f", "s16le",
        "-",
    ]
    ffmpeg_process = subprocess.Popen(
        ffmpeg_command,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )

    try:
        while True:
            audio_data = ffmpeg_process.stdout.read(chunk_size)
            if not audio_data:
                break

            rec.AcceptWaveform(audio_data)
            # Result() is requested after every chunk, so each chunk
            # produces at most one subtitle.
            result = json.loads(rec.Result())

            words = result.get("result")
            if not words:
                continue

            transcript = " ".join(word["word"] for word in words)
            if not transcript.strip():
                continue

            # Time the subtitle from its first word to its last word so it
            # is not displayed during the silence that precedes the speech.
            # NOTE(review): relies on Vosk word times being measured from
            # the start of the stream — confirm against the Vosk docs.
            first_word_start = words[0]["start"]
            last_word_end = words[-1]["end"]
            subtitles.append(
                pysrt.SubRipItem(
                    index=len(subtitles) + 1,
                    start=pysrt.SubRipTime(seconds=first_word_start),
                    end=pysrt.SubRipTime(seconds=last_word_end),
                    text=transcript,
                )
            )

            text_widget.insert(tk.END, f"--> {transcript}\n")
            text_widget.see(tk.END)  # keep the newest line visible
    finally:
        # Always release the pipe and reap ffmpeg, even if recognition fails.
        ffmpeg_process.stdout.close()
        ffmpeg_process.wait()

    return subtitles

class SpeechRecognitionApp(tk.Tk):
    """Main window of the subtitle generator.

    Lets the user pick a video file and a recognition language, runs
    ``recognize_speech`` on a worker thread and writes the resulting
    subtitles to an ``.srt`` file next to the video.
    """

    # Suffix appended to the video's base name for each recognition language.
    _SRT_SUFFIXES = {"en": "_ing.srt", "pt": "_por.srt", "es": "_esp.srt"}

    def __init__(self):
        """Build the window: file picker, language radio buttons, log, start button."""
        super().__init__()
        self.title("Gerador de Legendas")
        self.geometry("500x500")

        self.file_path = tk.StringVar()
        self.language_choice = tk.StringVar(value="en")

        tk.Label(self, text="Escolha o arquivo de vídeo:").pack(pady=10)
        tk.Button(self, text="Procurar", command=self.browse_file).pack(pady=5)

        tk.Label(self, text="Selecione o idioma para reconhecimento:").pack(pady=10)
        tk.Radiobutton(self, text="Inglês", variable=self.language_choice, value="en").pack(anchor=tk.W)
        tk.Radiobutton(self, text="Português", variable=self.language_choice, value="pt").pack(anchor=tk.W)
        tk.Radiobutton(self, text="Espanhol", variable=self.language_choice, value="es").pack(anchor=tk.W)

        # Text area used both for status messages and for the live transcript.
        self.display_text = tk.Text(self, wrap="word", width=60, height=10)
        self.display_text.pack(pady=10)

        self.start_recognition_button = tk.Button(self, text="Iniciar Reconhecimento", command=self.start_recognition_process)
        self.start_recognition_button.pack(pady=10)

    def _show_message(self, message=""):
        """Replace the whole content of the text area with ``message``."""
        self.display_text.delete("1.0", tk.END)
        if message:
            self.display_text.insert(tk.END, message)

    def browse_file(self):
        """Ask the user for a video file and remember the chosen path."""
        selected = filedialog.askopenfilename(filetypes=[("Arquivos de Vídeo", "*.mp4")])
        if not selected:
            # Dialog cancelled: keep the previously selected file, if any.
            return
        self.file_path.set(selected)
        self._show_message(f"Arquivo de vídeo: {selected}")

    def start_recognition_process(self):
        """Validate the selection and start recognition on a worker thread."""
        file_path = self.file_path.get()
        language = self.language_choice.get()
        if not file_path:
            self._show_message("Selecione um arquivo de vídeo.")
            return

        if not os.path.isfile(file_path):
            self._show_message("O arquivo de vídeo não existe.")
            return

        # Clear any previous transcript or status message.
        self._show_message()

        # Disable the start button so a second run cannot be launched
        # while one is in progress; perform_recognition re-enables it.
        self.start_recognition_button.config(state=tk.DISABLED)

        # Daemon thread: closing the window must not leave the process
        # alive waiting for an unfinished recognition.
        threading.Thread(
            target=self.perform_recognition,
            args=(file_path, language),
            daemon=True,
        ).start()

    def save_subtitles(self, output_file, subtitles):
        """Write ``subtitles`` to ``output_file`` as UTF-8 and report completion.

        Each item is written via ``str()`` followed by a newline.
        """
        with open(output_file, "w", encoding="utf-8") as f:
            for subtitle in subtitles:
                f.write(str(subtitle))
                f.write('\n')

        self._show_message(f"Processo concluído. Legenda gerada em {output_file}")

    def perform_recognition(self, file_path, language):
        """Run recognition and save the ``.srt`` file; executed on the worker thread.

        Any exception is reported in the text area instead of propagating,
        and the start button is re-enabled in every case.
        """
        try:
            subtitles = pysrt.SubRipFile()
            self.display_text.insert(tk.END, "Iniciando o reconhecimento...\n")
            self.update_idletasks()

            recognize_speech(file_path, language, self.display_text, subtitles)

            # The output sits next to the video: <base name> + language suffix.
            # The fallback keeps a ".srt" extension for any other language code.
            base_name = os.path.splitext(file_path)[0]
            output_file = base_name + self._SRT_SUFFIXES.get(language, ".srt")

            self.save_subtitles(output_file, subtitles)

        except Exception as e:
            self.display_text.insert(tk.END, f"Erro durante o reconhecimento: {str(e)}\n")
            self.update_idletasks()

        finally:
            # Allow a new run once this one has finished or failed.
            self.start_recognition_button.config(state=tk.NORMAL)

if __name__ == "__main__":
    # Script entry point: create the main window and run Tk's event loop
    # until the window is closed.
    SpeechRecognitionApp().mainloop()

Tipoalgo

sábado, 29 de julho de 2023

Gerador de Legendas - 5.0

Nenhum comentário:

Postar um comentário

Arquivo do blog

Pesquisar neste blog