Files
rmser/ocr-service/scripts/collect_data_raw.py

67 lines
2.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import requests
import json
import mimetypes
# Folder where you put the receipt photos (or .xlsx files) to test with
INPUT_DIR = "./test_receipts"
# Folder where the raw recognized text is saved
OUTPUT_DIR = "./raw_outputs"
# URL of the running OCR service
API_URL = "http://10.25.100.250:5006/recognize"
def process_images():
    """Send every supported file in INPUT_DIR to the OCR service and save its raw text.

    Files ending in .jpg, .jpeg, .png or .xlsx (case-insensitive) are posted
    one by one to API_URL as the multipart field ``image``. For each HTTP 200
    response the JSON fields ``raw_text`` and ``source`` are written to
    ``OUTPUT_DIR/<filename>_RAW.txt``. Progress and failures are printed to
    stdout; a failure on one file never aborts the rest of the batch.

    If INPUT_DIR does not exist it is created, a hint is printed, and the
    function returns without processing anything.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    if not os.path.exists(INPUT_DIR):
        os.makedirs(INPUT_DIR, exist_ok=True)
        print(f"Папка {INPUT_DIR} создана. Положите туда фото чеков и перезапустите скрипт.")
        return

    supported_extensions = ('.jpg', '.jpeg', '.png', '.xlsx')
    # Sorted so runs are reproducible (os.listdir order is arbitrary), and
    # restricted to regular files so a directory named "x.png" is not opened.
    filenames = sorted(
        name
        for name in os.listdir(INPUT_DIR)
        if name.lower().endswith(supported_extensions)
        and os.path.isfile(os.path.join(INPUT_DIR, name))
    )

    if not filenames:
        print(f"В папке {INPUT_DIR} нет изображений.")
        return

    print(f"Найдено {len(filenames)} файлов. Начинаю обработку...")

    xlsx_mime = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
    separator = "=" * 20

    for filename in filenames:
        file_path = os.path.join(INPUT_DIR, filename)

        # Set the MIME type for Excel files explicitly rather than relying on
        # mimetypes.guess_type; everything else falls back to JPEG if unknown.
        if filename.lower().endswith('.xlsx'):
            mime_type = xlsx_mime
        else:
            mime_type = mimetypes.guess_type(file_path)[0] or 'image/jpeg'

        # flush so the progress text is visible while the request is in flight.
        print(f"Processing {filename}...", end=" ", flush=True)

        # Per-file boundary: any failure is reported and the batch continues.
        try:
            with open(file_path, 'rb') as f:
                # Named "payload" so it no longer shadows the list being iterated.
                payload = {'image': (filename, f, mime_type)}
                response = requests.post(API_URL, files=payload, timeout=30)

            if response.status_code != 200:
                print(f"FAIL: {response.status_code} - {response.text}")
                continue

            data = response.json()
            if not isinstance(data, dict):
                raise ValueError(
                    f"unexpected JSON payload of type {type(data).__name__}"
                )

            # "or" also covers an explicit JSON null, which .get's default does not.
            raw_text = data.get("raw_text") or ""
            source = data.get("source", "unknown")

            # Build the full content before opening the output file so a bad
            # value cannot leave a half-written file behind.
            content = f"Source: {source}\n{separator}\n{raw_text}"

            # Save the RAW text
            out_name = f"{filename}_RAW.txt"
            with open(os.path.join(OUTPUT_DIR, out_name), "w", encoding="utf-8") as out:
                out.write(content)

            print(f"OK ({source}) -> {out_name}")
        except Exception as e:
            print(f"ERROR: {e}")
# Run the batch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    process_images()