forked from Anteros-Code-Mentoria/poc-mvc-ocr
54 lines
1.7 KiB
Python
54 lines
1.7 KiB
Python
import os
|
|
import uuid
|
|
from typing import Dict
|
|
from PIL import Image
|
|
from pdf2image import convert_from_path
|
|
from app.services.correction_service import correct_text
|
|
from app.services.engines.tesseract_ocr import ocr_tesseract
|
|
from app.services.engines.easyocr_ocr import ocr_easyocr
|
|
from app.services.engines.paddleocr_ocr import ocr_paddleocr
|
|
# from app.services.engines.mmocr_ocr import ocr_mmocr # Opcional
|
|
from app.utils import save_text_file
|
|
|
|
# NOTE(review): not referenced anywhere in the visible code; presumably the
# directory where uploaded documents are stored -- confirm against callers.
UPLOAD_DIR = "app/static/uploads"
|
|
# Root directory under which process_document creates one sub-folder per
# document (named after the file's base name) to hold the OCR text outputs.
TEXTS_DIR = "app/static/texts"
|
|
|
|
def process_document(file_path: str) -> Dict[str, str]:
    """Run every enabled OCR engine over a document and save the results.

    A path ending in ``.pdf`` (case-insensitive) is converted to one image
    per page with ``convert_from_path``; any other path is opened as a single
    image. Each page is written to a temporary PNG inside the output folder
    (the EasyOCR and PaddleOCR wrappers are given a file path, while the
    Tesseract wrapper receives the image object), OCR'd by each engine, and
    the temporary PNG is removed again.

    The text of each engine is the concatenation of its page texts, each
    followed by a newline, and is saved as
    ``<TEXTS_DIR>/<base name>/<engine>.txt``. The Tesseract text is also
    passed through ``correct_text`` and saved as ``corrigido.txt``.

    Args:
        file_path: Path of the PDF or image file to process.

    Returns:
        A dict with the keys ``"tesseract"``, ``"easyocr"``, ``"paddleocr"``
        and ``"corrigido"``, mapping to the extracted (or corrected) text.
    """
    filename = os.path.basename(file_path)
    base_name = os.path.splitext(filename)[0]
    output_folder = os.path.join(TEXTS_DIR, base_name)
    os.makedirs(output_folder, exist_ok=True)

    if file_path.lower().endswith(".pdf"):
        images = convert_from_path(file_path)
    else:
        images = [Image.open(file_path)]

    # Per-engine list of page texts; joined once at the end instead of
    # growing a string with += on every page. The optional "mmocr" engine is
    # currently disabled and therefore has no entry here.
    page_texts = {
        "tesseract": [],
        "easyocr": [],
        "paddleocr": [],
    }

    try:
        for i, image in enumerate(images):
            temp_img_path = os.path.join(output_folder, f"page_{i}.png")
            try:
                image.save(temp_img_path)

                page_texts["tesseract"].append(ocr_tesseract(image))
                page_texts["easyocr"].append(ocr_easyocr(temp_img_path))
                page_texts["paddleocr"].append(ocr_paddleocr(temp_img_path))
            finally:
                # Remove the temporary page image even when saving or an
                # engine fails, so failed runs do not leave PNGs behind.
                # The existence check keeps a failed save from masking the
                # original exception with a FileNotFoundError.
                if os.path.exists(temp_img_path):
                    os.remove(temp_img_path)
    finally:
        # Release the underlying file handle / pixel buffers of every page,
        # including pages that were never reached because of an error.
        for image in images:
            image.close()

    # Every page contributes its text followed by a newline.
    results = {
        engine: "".join(text + "\n" for text in pages)
        for engine, pages in page_texts.items()
    }

    for engine, text in results.items():
        save_text_file(text, os.path.join(output_folder, f"{engine}.txt"))

    corrected = correct_text(results["tesseract"])
    save_text_file(corrected, os.path.join(output_folder, "corrigido.txt"))
    results["corrigido"] = corrected

    return results
|