From 8a68a6b652061e54a751453ccee9685c17379cb0 Mon Sep 17 00:00:00 2001 From: "Lila.rodri" Date: Mon, 26 May 2025 19:47:39 -0300 Subject: [PATCH] ajustado --- app/services/ocr_service.py | 77 +++++++++++++++++++++++-------------- run.py | 2 +- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/app/services/ocr_service.py b/app/services/ocr_service.py index 389c439..6d47e1c 100644 --- a/app/services/ocr_service.py +++ b/app/services/ocr_service.py @@ -14,40 +14,59 @@ UPLOAD_DIR = "app/static/uploads" TEXTS_DIR = "app/static/texts" def process_document(file_path: str) -> Dict[str, str]: - filename = os.path.basename(file_path) - base_name = os.path.splitext(filename)[0] - output_folder = os.path.join(TEXTS_DIR, base_name) - os.makedirs(output_folder, exist_ok=True) + """ + Processa um arquivo PDF ou imagem, aplica OCR com diferentes engines e salva os textos extraídos. + Retorna os textos por engine e o texto corrigido. - images = [] - if file_path.lower().endswith(".pdf"): - images = convert_from_path(file_path) - else: - images = [Image.open(file_path)] + Args: + file_path (str): Caminho do arquivo a ser processado. - results = { - "tesseract": "", - "easyocr": "", - "paddleocr": "", - # "mmocr": "" - } + Returns: + Dict[str, str]: Dicionário com os textos por engine e o texto corrigido. + """ + try: + if not os.path.exists(file_path): + raise FileNotFoundError(f"Arquivo não encontrado: {file_path}") - for i, image in enumerate(images): - temp_img_path = os.path.join(output_folder, f"page_{i}.png") - image.save(temp_img_path) + filename = os.path.basename(file_path) + base_name = os.path.splitext(filename)[0] + output_folder = os.path.join(TEXTS_DIR, base_name) + os.makedirs(output_folder, exist_ok=True) - results["tesseract"] += ocr_tesseract(image) + "\n" - results["easyocr"] += ocr_easyocr(temp_img_path) + "\n" - results["paddleocr"] += ocr_paddleocr(temp_img_path) + "\n" - # results["mmocr"] += ocr_mmocr(temp_img_path) + "\n" + # Converte PDF para imagens ou carrega imagem única + images = convert_from_path(file_path) if file_path.lower().endswith(".pdf") else [Image.open(file_path)] - os.remove(temp_img_path) + results = { + "tesseract": "", + "easyocr": "", + "paddleocr": "", + # "mmocr": "" + } - for engine, text in results.items(): - save_text_file(text, os.path.join(output_folder, f"{engine}.txt")) + # Processa cada página/imagem + for i, image in enumerate(images): + temp_img_path = os.path.join(output_folder, f"page_{i}.png") + image.save(temp_img_path) - corrected = correct_text(results["tesseract"]) - save_text_file(corrected, os.path.join(output_folder, "corrigido.txt")) - results["corrigido"] = corrected + results["tesseract"] += ocr_tesseract(image) + "\n" + results["easyocr"] += ocr_easyocr(temp_img_path) + "\n" + results["paddleocr"] += ocr_paddleocr(temp_img_path) + "\n" + # results["mmocr"] += ocr_mmocr(temp_img_path) + "\n" + + os.remove(temp_img_path) + + # Salva os textos extraídos + for engine, text in results.items(): + save_text_file(text, os.path.join(output_folder, f"{engine}.txt")) + + # Aplica correção no texto do tesseract + corrected = correct_text(results["tesseract"]) + save_text_file(corrected, os.path.join(output_folder, "corrigido.txt")) + results["corrigido"] = corrected + + return results + + except Exception as e: + print(f"[ERRO] Falha ao processar documento: {e}") + return {"error": str(e)} - return results diff --git a/run.py b/run.py index 4b54c37..afec676 100644 --- a/run.py +++ b/run.py @@ -10,4 +10,4 @@ if __name__ == "__main__": create_default_user() print("Banco de dados criado com sucesso!") - app.run(debug=True) + app.run(debug=True, port=5003)