Files
rmser/ocr-service/scripts/test_parsing_quality.py

62 lines
2.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import requests
import json
import mimetypes
# Directory holding the input photos / Excel files to send for recognition.
INPUT_DIR = "./test_receipts"

# Directory where the JSON responses are written.
OUTPUT_DIR = "./json_results"

# Recognition endpoint of the service.
API_URL = "http://10.25.100.250:5006/recognize"
def test_parsing():
    """Post every supported file in INPUT_DIR to API_URL and save the JSON replies.

    Files whose names end in .jpg, .jpeg, .png or .xlsx (case-insensitive)
    are uploaded one at a time as the multipart field ``image``. For each
    HTTP 200 reply the decoded JSON is written to
    ``OUTPUT_DIR/<filename>_RESULT.json`` and a one-line summary is printed;
    any other status is printed as ``FAIL``. An exception raised while
    handling one file is printed as ``ERROR`` and the loop moves on to the
    next file.

    Returns:
        None. Results are reported on stdout and written to OUTPUT_DIR.
    """
    # Created before the input check, so the results folder exists even
    # when there turns out to be nothing to process.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    if not os.path.exists(INPUT_DIR):
        print(f"Папка {INPUT_DIR} не найдена.")
        return

    supported_exts = ('.jpg', '.jpeg', '.png', '.xlsx')
    filenames = [
        name for name in os.listdir(INPUT_DIR)
        if name.lower().endswith(supported_exts)
    ]
    print(f"Найдено {len(filenames)} файлов. Тестируем парсинг...")

    for filename in filenames:
        file_path = os.path.join(INPUT_DIR, filename)

        # Determine the MIME type; .xlsx is set explicitly rather than guessed.
        if filename.lower().endswith('.xlsx'):
            mime_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        else:
            mime_type, _ = mimetypes.guess_type(file_path)
            mime_type = mime_type or 'image/jpeg'

        # Flush so the progress text is visible while the request is in flight;
        # the line is only terminated by the OK/FAIL/ERROR print below.
        print(f"Processing {filename} ({mime_type})...", end=" ", flush=True)

        # Broad catch is deliberate: one bad file must not abort the whole batch.
        try:
            with open(file_path, 'rb') as upload:
                payload = {'image': (filename, upload, mime_type)}
                # Generous timeout, since Excel + LLM processing can be slow.
                response = requests.post(API_URL, files=payload, timeout=60)

            if response.status_code == 200:
                data = response.json()
                items = data.get("items", [])
                source = data.get("source", "unknown")
                doc_number = data.get("doc_number", "")

                # Save the JSON response next to the other results.
                out_name = f"{filename}_RESULT.json"
                with open(os.path.join(OUTPUT_DIR, out_name), "w", encoding="utf-8") as out:
                    json.dump(data, out, ensure_ascii=False, indent=2)

                print(f"OK ({source}) -> Found {len(items)} items. Doc#: {doc_number}")
            else:
                print(f"FAIL: {response.status_code} - {response.text}")
        except Exception as e:
            print(f"ERROR: {e}")
# Run the batch check only when executed as a script, not on import.
if __name__ == "__main__":
    test_parsing()