import os
import uuid
from typing import Dict

from PIL import Image
from pdf2image import convert_from_path

from app.services.correction_service import correct_text
from app.services.engines.tesseract_ocr import ocr_tesseract
from app.services.engines.easyocr_ocr import ocr_easyocr
from app.services.engines.paddleocr_ocr import ocr_paddleocr
# from app.services.engines.mmocr_ocr import ocr_mmocr  # Optional
from app.utils import save_text_file

UPLOAD_DIR = "app/static/uploads"
# Root folder under which one sub-folder per processed document is created.
TEXTS_DIR = "app/static/texts"


def process_document(file_path: str) -> Dict[str, str]:
    """Run every OCR engine over a PDF or image and save the extracted texts.

    A path ending in ".pdf" (case-insensitive) is converted to one image per
    page; any other path is opened as a single image. Each page is written to
    a temporary PNG inside the output folder (the EasyOCR and PaddleOCR
    wrappers are called with a file path, the Tesseract wrapper with the image
    object) and the PNG is removed afterwards, even when an engine fails.

    The text of each engine is saved as ``<engine>.txt`` and the corrected
    Tesseract text as ``corrigido.txt`` inside
    ``TEXTS_DIR/<file name without extension>``.

    Args:
        file_path: Path of the file to process.

    Returns:
        A dictionary with the keys "tesseract", "easyocr", "paddleocr" and
        "corrigido". Each engine text holds one entry per page, every entry
        followed by a newline. If anything fails, the error is printed and
        ``{"error": <message>}`` is returned instead; no exception propagates.
    """
    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

        filename = os.path.basename(file_path)
        base_name = os.path.splitext(filename)[0]
        output_folder = os.path.join(TEXTS_DIR, base_name)
        os.makedirs(output_folder, exist_ok=True)

        # Convert the PDF into page images, or load the single image.
        if file_path.lower().endswith(".pdf"):
            images = convert_from_path(file_path)
        else:
            images = [Image.open(file_path)]

        # Collect per-page texts in lists and join once at the end instead of
        # growing strings with "+=" inside the loop.
        page_texts = {
            "tesseract": [],
            "easyocr": [],
            "paddleocr": [],
            # "mmocr": [],
        }

        try:
            # Process each page/image.
            for i, image in enumerate(images):
                temp_img_path = os.path.join(output_folder, f"page_{i}.png")
                try:
                    image.save(temp_img_path)
                    page_texts["tesseract"].append(ocr_tesseract(image) + "\n")
                    page_texts["easyocr"].append(ocr_easyocr(temp_img_path) + "\n")
                    page_texts["paddleocr"].append(ocr_paddleocr(temp_img_path) + "\n")
                    # page_texts["mmocr"].append(ocr_mmocr(temp_img_path) + "\n")
                finally:
                    # Never leave the temporary PNG behind, even when saving
                    # or one of the engines raised.
                    if os.path.exists(temp_img_path):
                        os.remove(temp_img_path)
        finally:
            # Release the file handles / pixel buffers held by the images.
            for image in images:
                image.close()

        results = {engine: "".join(pages) for engine, pages in page_texts.items()}

        # Save the extracted texts, one file per engine.
        for engine, text in results.items():
            save_text_file(text, os.path.join(output_folder, f"{engine}.txt"))

        # Apply the correction step to the Tesseract text only.
        corrected = correct_text(results["tesseract"])
        save_text_file(corrected, os.path.join(output_folder, "corrigido.txt"))
        results["corrigido"] = corrected

        return results

    except Exception as e:
        # Top-level boundary: callers receive an error dictionary rather than
        # an exception.
        print(f"[ERRO] Falha ao processar documento: {e}")
        return {"error": str(e)}