2612-есть ок OCR, нужно допиливать бота под новый flow для операторов

2026-02-04 19:02:33 -06:00 · 2026-01-27 00:17:10 +03:00
parent 7d2ffb54b5
commit 1843cb9c20
22 changed files with 1011 additions and 577 deletions
--- a/ocr-service/scripts/collect_data_raw.py
+++ b/ocr-service/scripts/collect_data_raw.py
@@ -0,0 +1,67 @@
+import os
+import requests
+import json
+import mimetypes
+
+# Folder where you put the receipt photos to test with
+INPUT_DIR = "./test_receipts"
+# Folder where the raw text is saved
+OUTPUT_DIR = "./raw_outputs"
+# Address of the running OCR service
+API_URL = "http://10.25.100.250:5006/recognize"
+
+def process_images():
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR)
+
+    if not os.path.exists(INPUT_DIR):
+        os.makedirs(INPUT_DIR)
+        print(f"Папка {INPUT_DIR} создана. Положите туда фото чеков и перезапустите скрипт.")
+        return
+
+    files = [f for f in os.listdir(INPUT_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.xlsx'))]
+    
+    if not files:
+        print(f"В папке {INPUT_DIR} нет изображений.")
+        return
+
+    print(f"Найдено {len(files)} файлов. Начинаю обработку...")
+
+    for filename in files:
+        file_path = os.path.join(INPUT_DIR, filename)
+        
+        # Явное определение mime_type для Excel файлов
+        if filename.lower().endswith('.xlsx'):
+            mime_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+        else:
+            mime_type, _ = mimetypes.guess_type(file_path)
+            mime_type = mime_type or 'image/jpeg'
+        
+        print(f"Processing {filename}...", end=" ")
+        
+        try:
+            with open(file_path, 'rb') as f:
+                files = {'image': (filename, f, mime_type or 'image/jpeg')}
+                response = requests.post(API_URL, files=files, timeout=30)
+            
+            if response.status_code == 200:
+                data = response.json()
+                raw_text = data.get("raw_text", "")
+                source = data.get("source", "unknown")
+                
+                # Сохраняем RAW текст
+                out_name = f"{filename}_RAW.txt"
+                with open(os.path.join(OUTPUT_DIR, out_name), "w", encoding="utf-8") as out:
+                    out.write(f"Source: {source}\n")
+                    out.write("="*20 + "\n")
+                    out.write(raw_text)
+                
+                print(f"OK ({source}) -> {out_name}")
+            else:
+                print(f"FAIL: {response.status_code} - {response.text}")
+                
+        except Exception as e:
+            print(f"ERROR: {e}")
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    process_images()
--- a/ocr-service/scripts/test_parsing_quality.py
+++ b/ocr-service/scripts/test_parsing_quality.py
@@ -0,0 +1,62 @@
+import os
+import requests
+import json
+import mimetypes
+
+# Folder with the photos / Excel files
+INPUT_DIR = "./test_receipts"
+# Folder for the results
+OUTPUT_DIR = "./json_results"
+# Service address
+API_URL = "http://10.25.100.250:5006/recognize"
+
+def test_parsing():
+    if not os.path.exists(OUTPUT_DIR):
+        os.makedirs(OUTPUT_DIR)
+
+    if not os.path.exists(INPUT_DIR):
+        print(f"Папка {INPUT_DIR} не найдена.")
+        return
+
+    files = [f for f in os.listdir(INPUT_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png', '.xlsx'))]
+    
+    print(f"Найдено {len(files)} файлов. Тестируем парсинг...")
+
+    for filename in files:
+        file_path = os.path.join(INPUT_DIR, filename)
+        
+        # Определение MIME
+        if filename.lower().endswith('.xlsx'):
+            mime_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+        else:
+            mime_type, _ = mimetypes.guess_type(file_path)
+            mime_type = mime_type or 'image/jpeg'
+        
+        print(f"Processing {filename} ({mime_type})...", end=" ")
+        
+        try:
+            with open(file_path, 'rb') as f:
+                files = {'image': (filename, f, mime_type)}
+                # Тайм-аут побольше, так как Excel + LLM может быть долгим
+                response = requests.post(API_URL, files=files, timeout=60)
+            
+            if response.status_code == 200:
+                data = response.json()
+                items = data.get("items", [])
+                source = data.get("source", "unknown")
+                doc_number = data.get("doc_number", "")
+                
+                # Сохраняем JSON
+                out_name = f"{filename}_RESULT.json"
+                with open(os.path.join(OUTPUT_DIR, out_name), "w", encoding="utf-8") as out:
+                    json.dump(data, out, ensure_ascii=False, indent=2)
+                
+                print(f"OK ({source}) -> Found {len(items)} items. Doc#: {doc_number}")
+            else:
+                print(f"FAIL: {response.status_code} - {response.text}")
+                
+        except Exception as e:
+            print(f"ERROR: {e}")
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    test_parsing()