Files
rmser/ocr-service/ocr.py
SERTY 91923b8616 .venv deleted
ocr ready to test
2025-11-29 12:29:08 +03:00

38 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import logging
import pytesseract
from PIL import Image
import numpy as np
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# If tesseract is not on PATH, uncomment the line below and set the path:
# pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'
class OCREngine:
    """Thin wrapper around Tesseract OCR via ``pytesseract``.

    Constructing an instance checks that the Tesseract binary can be
    queried for its version, so a missing installation fails early
    instead of on the first recognition request.

    Args:
        lang: Optional Tesseract language spec passed as ``lang=`` to
            ``pytesseract.image_to_string``. Defaults to ``'rus+eng'``
            (Russian and English).
        config: Optional extra Tesseract flags passed as ``config=``.
            Defaults to ``'--oem 3 --psm 6'``; ``--psm 6`` assumes a
            single uniform block of text, which works well for receipts.

    Raises:
        Exception: Whatever ``pytesseract.get_tesseract_version()`` raises
            is logged and re-raised unchanged.
    """

    # Class-level defaults: used when the constructor is called without
    # arguments, which keeps ``OCREngine()`` behaving exactly as before.
    lang: str = 'rus+eng'
    config: str = r'--oem 3 --psm 6'

    def __init__(self, lang=None, config=None):
        logger.info("Initializing Tesseract OCR wrapper...")
        # Tesseract does not need models loaded into memory here; the
        # version query only verifies that the binary is installed.
        try:
            version = pytesseract.get_tesseract_version()
        except Exception:
            logger.error("Tesseract not found! Make sure it is installed (apt install tesseract-ocr).")
            # Bare raise re-raises the active exception as-is.
            raise
        logger.info("Tesseract version found: %s", version)

        # Only override the class-level defaults when explicitly requested.
        if lang is not None:
            self.lang = lang
        if config is not None:
            self.config = config

    def recognize(self, image: np.ndarray) -> str:
        """Run OCR on an image and return the recognized text.

        Args:
            image: Image as a numpy array. The original author's note says
                a binarized image is expected.

        Returns:
            The string returned by ``pytesseract.image_to_string``.
        """
        # Tesseract works better with a PIL Image, so convert first.
        pil_img = Image.fromarray(image)
        # The language goes through ``lang=``; ``config`` carries only the
        # remaining command-line flags (engine mode, page segmentation).
        return pytesseract.image_to_string(
            pil_img,
            lang=self.lang,
            config=self.config,
        )
# Shared module-level engine instance. Constructing it here runs the
# Tesseract version check in OCREngine.__init__ when this module is executed,
# and that check re-raises on failure.
ocr_engine = OCREngine()