Files
rmser/ocr-service/llm_parser.py

74 lines
2.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import requests
import logging
import json
from typing import List
from parser import ParsedItem
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Endpoint that YandexGPTParser.parse_with_llm POSTs its completion requests to.
YANDEX_GPT_URL = "https://llm.api.cloud.yandex.net/foundationModels/v1/completion"
class YandexGPTParser:
    """Ask YandexGPT to turn raw document text into ``ParsedItem`` objects.

    Configuration is read from the environment once, at construction time:
    ``YANDEX_FOLDER_ID`` is used for the model URI and the ``x-folder-id``
    request header.
    """

    # System instruction sent with every request. It is part of the request
    # payload, so its text must not be altered cosmetically.
    _SYSTEM_PROMPT = (
        "Ты — помощник по бухгалтерии. Извлеки список товаров из текста документа. "
        "Верни ответ строго в формате JSON: "
        '[{"raw_name": string, "amount": float, "price": float, "sum": float}]. '
        "Если количество не указано, считай 1.0. Не пиши ничего, кроме JSON."
    )

    def __init__(self):
        self.folder_id = os.getenv("YANDEX_FOLDER_ID")
        # Reuses the same credential variable (YANDEX_OAUTH_TOKEN). It is not
        # read by parse_with_llm, which authenticates with the IAM token
        # passed by the caller.
        self.api_key = os.getenv("YANDEX_OAUTH_TOKEN")

    def parse_with_llm(self, raw_text: str, iam_token: str) -> List[ParsedItem]:
        """Send the text to YandexGPT for structuring.

        Args:
            raw_text: Document text to extract line items from.
            iam_token: IAM token sent as the ``Bearer`` credential.

        Returns:
            One ``ParsedItem`` per object in the JSON array returned by the
            model. An empty list is returned when a precondition is missing
            (no token, no folder id, blank text) or when any step of the
            request/decoding fails; failures are logged, never raised.
        """
        if not iam_token:
            return []
        if not self.folder_id:
            # Without a folder id the model URI would be "gpt://None/...",
            # which can never succeed; skip the network round trip.
            logger.error("LLM Parsing error: YANDEX_FOLDER_ID is not set")
            return []
        if not raw_text or not raw_text.strip():
            # Nothing to extract from; do not spend an API call on blank text.
            return []

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {iam_token}",
            "x-folder-id": self.folder_id,
        }
        try:
            response = requests.post(
                YANDEX_GPT_URL,
                headers=headers,
                json=self._build_payload(raw_text),
                timeout=30,
            )
            response.raise_for_status()
            # Pull the answer text out of the first alternative.
            content = response.json()["result"]["alternatives"][0]["message"]["text"]
            items_raw = self._load_json_answer(content)
            if not isinstance(items_raw, list):
                raise ValueError(
                    f"expected a JSON array, got {type(items_raw).__name__}"
                )
            return [ParsedItem(**item) for item in items_raw]
        except Exception as e:
            # Deliberate best-effort boundary: transport errors, unexpected
            # response shapes, invalid JSON and item construction failures all
            # degrade to "no items". The traceback is logged for diagnosis.
            logger.exception("LLM Parsing error: %s", e)
            return []

    def _build_payload(self, raw_text: str) -> dict:
        """Build the JSON body of the completion request for ``raw_text``."""
        return {
            "modelUri": f"gpt://{self.folder_id}/yandexgpt/latest",
            "completionOptions": {
                "stream": False,
                "temperature": 0.1,  # low temperature for accuracy
                "maxTokens": "2000",
            },
            "messages": [
                {"role": "system", "text": self._SYSTEM_PROMPT},
                {"role": "user", "text": raw_text},
            ],
        }

    @staticmethod
    def _load_json_answer(content: str):
        """Decode the model's answer, tolerating markdown fences and stray prose."""
        # Strip possible markdown wrappers such as ```json ... ```.
        cleaned = content.replace("```json", "").replace("```", "").strip()
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # The model sometimes adds text around the array despite the
            # instruction; retry on the outermost [...] span only.
            start = cleaned.find("[")
            end = cleaned.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start:end + 1])
# Module-level parser instance, created when this module is imported; its
# constructor reads the YANDEX_* environment variables at that moment.
llm_parser = YandexGPTParser()