This commit is contained in:
Lila.rodri 2025-05-26 19:47:39 -03:00
parent f0ea5b282b
commit 8a68a6b652
2 changed files with 49 additions and 30 deletions

View File

@ -14,40 +14,59 @@ UPLOAD_DIR = "app/static/uploads"
# Root directory for OCR output: process_document creates one subfolder per
# input document (named after the file's base name) and writes its text files there.
TEXTS_DIR = "app/static/texts"
def process_document(file_path: str) -> Dict[str, str]:
    """
    Run each OCR engine over a PDF or image file and save the extracted text.

    A PDF is converted to one image per page; any other file is opened as a
    single image. Every page is passed through the tesseract, easyocr and
    paddleocr helpers, the per-engine text is written to
    ``<TEXTS_DIR>/<base_name>/<engine>.txt``, and a corrected version of the
    tesseract text is written to ``corrigido.txt`` in the same folder.

    Args:
        file_path: Path of the file to process.

    Returns:
        A dict mapping each engine name to its extracted text, plus a
        ``"corrigido"`` key holding the corrected tesseract text. If anything
        fails (including a missing input file), the error is printed and a
        dict with a single ``"error"`` key holding the message is returned
        instead of raising.
    """
    try:
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Arquivo não encontrado: {file_path}")

        filename = os.path.basename(file_path)
        base_name = os.path.splitext(filename)[0]
        output_folder = os.path.join(TEXTS_DIR, base_name)
        os.makedirs(output_folder, exist_ok=True)

        # Convert a PDF into page images, or load a single image.
        if file_path.lower().endswith(".pdf"):
            images = convert_from_path(file_path)
        else:
            images = [Image.open(file_path)]

        # Per-engine list of page texts; joined once after the loop.
        # NOTE: the "mmocr" engine is currently disabled.
        page_texts = {
            "tesseract": [],
            "easyocr": [],
            "paddleocr": [],
        }

        try:
            for i, image in enumerate(images):
                # easyocr/paddleocr helpers take a path, so each page is
                # written to a temporary PNG inside the output folder.
                temp_img_path = os.path.join(output_folder, f"page_{i}.png")
                try:
                    image.save(temp_img_path)
                    page_texts["tesseract"].append(ocr_tesseract(image) + "\n")
                    page_texts["easyocr"].append(ocr_easyocr(temp_img_path) + "\n")
                    page_texts["paddleocr"].append(ocr_paddleocr(temp_img_path) + "\n")
                finally:
                    # Remove the temporary page even when an engine fails, so
                    # stale page_*.png files do not pile up next to the results.
                    if os.path.exists(temp_img_path):
                        os.remove(temp_img_path)
        finally:
            # Release the underlying file handles / pixel buffers.
            # NOTE(review): assumes these are Pillow images (they already
            # expose .save); confirm against the file's imports.
            for image in images:
                image.close()

        results = {engine: "".join(texts) for engine, texts in page_texts.items()}

        # Save the raw text extracted by each engine.
        for engine, text in results.items():
            save_text_file(text, os.path.join(output_folder, f"{engine}.txt"))

        # Apply the correction step to the tesseract output only.
        corrected = correct_text(results["tesseract"])
        save_text_file(corrected, os.path.join(output_folder, "corrigido.txt"))
        results["corrigido"] = corrected

        return results

    except Exception as e:
        # Top-level boundary: callers receive an error dict instead of an exception.
        print(f"[ERRO] Falha ao processar documento: {e}")
        return {"error": str(e)}

2
run.py
View File

@ -10,4 +10,4 @@ if __name__ == "__main__":
create_default_user()
print("Banco de dados criado com sucesso!")
app.run(debug=True)
app.run(debug=True, port=5003)