#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
pack_project_dump.py

Упаковывает код проекта в один текстовый файл, удобный для анализа:
- дерево файлов
- затем содержимое каждого файла в блоках с маркерами
- фильтрация мусорных директорий (node_modules, dist, build и т.п.)
- лимит размера на файл, чтобы не раздувать дамп
- попытка декодирования utf-8 с заменой ошибок

Пример:
  python pack_project_dump.py --root . --out project_dump.txt
"""

from __future__ import annotations

import argparse
import fnmatch
import hashlib
import os
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List, Optional, Tuple


# Directory names that are skipped wherever they occur in the project tree.
DEFAULT_EXCLUDE_DIRS = {
    # version control and editor metadata
    ".git",
    ".idea",
    ".vscode",
    # installed dependencies
    "node_modules",
    # build and tooling output
    "build",
    "dist",
    "coverage",
    ".cache",
    ".next",
    ".turbo",
    ".vercel",
}

# Lock files are skipped by default: they could be kept, but are often huge.
DEFAULT_EXCLUDE_FILES = {
    "pnpm-lock.yaml",
    "yarn.lock",
    "package-lock.json",
}

# Suffixes (plus a few whole file names) of files that are treated as text.
DEFAULT_TEXT_EXTS = {
    # scripts
    ".js", ".jsx", ".mjs", ".cjs", ".ts", ".tsx",
    # styles and markup
    ".css", ".scss", ".sass", ".less", ".html",
    # data, configuration and documentation
    ".json", ".yml", ".yaml", ".md", ".txt",
    # whole file names rather than suffixes
    ".env", ".env.example", ".gitignore", ".editorconfig", "Dockerfile",
}


@dataclass(frozen=True)
class FileEntry:
    """Immutable index record for one file whose content went into the dump."""

    # Path relative to the project root, written with "/" separators.
    rel_path: str
    # Size of the file on disk, in bytes.
    size: int
    # Hex SHA-256 digest of the bytes that were read (possibly truncated).
    sha256: str


def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 digest of ``data`` as a lowercase hex string."""
    return hashlib.sha256(data).hexdigest()


def is_probably_text(path: Path, extra_exts: Optional[set[str]] = None) -> bool:
    """Guess from the file name alone whether ``path`` is a text file.

    The file content is never inspected. A file counts as text when its
    lower-cased suffix or its whole name is listed in ``extra_exts`` or in
    ``DEFAULT_TEXT_EXTS``, or when it is one of a few well-known
    extension-less config files.

    Args:
        path: File path; only ``path.name`` and ``path.suffix`` are used.
        extra_exts: Optional additional suffixes (with the leading dot,
            lower-case) or whole file names to accept.

    Returns:
        True if the name looks like a text file, False otherwise.
    """
    name = path.name
    ext = path.suffix.lower()

    if extra_exts and (ext in extra_exts or name in extra_exts):
        return True

    # Extension-less config files that are text by convention.
    if name in {".eslintrc", ".prettierrc"}:
        return True

    # Path.suffix is "" for dotfiles (".env", ".gitignore") and for names
    # without a dot ("Dockerfile"), and only ".example" for ".env.example".
    # Those entries can therefore only ever match the whole file name, so
    # the name is checked in addition to the suffix.
    return ext in DEFAULT_TEXT_EXTS or name in DEFAULT_TEXT_EXTS


def _glob_variants(pattern: str) -> Tuple[str, ...]:
    """Return ``pattern`` plus, if applicable, a copy without ``**/`` segments.

    ``fnmatch`` has no notion of path separators, so ``**/x`` only matches
    paths that contain at least one "/". Dropping every whole ``**`` segment
    (except a trailing one) yields the variant in which ``**/`` matches zero
    directories.
    """
    segments = pattern.split("/")
    last = len(segments) - 1
    collapsed = "/".join(
        seg for idx, seg in enumerate(segments) if seg != "**" or idx == last
    )
    if collapsed == pattern:
        return (pattern,)
    return (pattern, collapsed)


def should_exclude_path(
    rel_parts: Tuple[str, ...],
    exclude_dirs: set[str],
    exclude_file_globs: List[str],
    exclude_files: set[str],
) -> bool:
    """Decide whether a file must be left out of the dump.

    Args:
        rel_parts: Path segments relative to the project root; the last
            segment is the file name.
        exclude_dirs: Directory names that exclude a file when they occur in
            any segment other than the last one.
        exclude_file_globs: ``fnmatch`` patterns tested against both the
            "/"-joined relative path and the bare file name. A ``**/``
            segment may also match zero directories, so ``**/*.min.js``
            covers files directly in the root as well.
        exclude_files: Exact file names to exclude.

    Returns:
        True if the file should be skipped, False otherwise.
    """
    # Only the directory segments are checked here; a file that merely shares
    # its name with an excluded directory is not excluded.
    if any(part in exclude_dirs for part in rel_parts[:-1]):
        return True

    name = rel_parts[-1] if rel_parts else ""
    if name in exclude_files:
        return True

    rel_str = "/".join(rel_parts)
    return any(
        fnmatch.fnmatch(candidate, variant)
        for pat in exclude_file_globs
        for variant in _glob_variants(pat)
        for candidate in (rel_str, name)
    )


def iter_project_files(
    root: Path,
    exclude_dirs: set[str],
    exclude_files: set[str],
    exclude_file_globs: List[str],
) -> Iterable[Path]:
    """Yield every file under ``root`` that passes the exclusion filters.

    Args:
        root: Directory to walk.
        exclude_dirs: Directory names that are not descended into.
        exclude_files: Exact file names to skip.
        exclude_file_globs: Glob patterns for files to skip.

    Yields:
        Paths of the surviving files, in ``os.walk`` order.
    """
    for current_dir, subdirs, files in os.walk(root):
        # Prune in place so os.walk never descends into excluded directories.
        subdirs[:] = [name for name in subdirs if name not in exclude_dirs]

        base = Path(current_dir)
        for file_name in files:
            candidate = base / file_name
            parts = tuple(candidate.relative_to(root).parts)
            if not should_exclude_path(parts, exclude_dirs, exclude_file_globs, exclude_files):
                yield candidate


def build_tree_listing(paths: List[Path], root: Path) -> str:
    """Render a flat, sorted list of the files relative to ``root``.

    Args:
        paths: Files to list; each must be located under ``root``.
        root: Directory the listed paths are made relative to.

    Returns:
        A header line followed by one "- <relative/path>" line per file,
        using "/" separators and terminated by a newline.
    """
    rel_names = [p.relative_to(root).as_posix() for p in paths]
    rel_names.sort()
    rows = ["Дерево файлов:", *(f"- {name}" for name in rel_names)]
    return "\n".join(rows) + "\n"


def read_file_bytes(path: Path, max_file_bytes: int) -> Tuple[bytes, bool]:
    """Read at most ``max_file_bytes`` bytes from the start of ``path``.

    Only ``max_file_bytes + 1`` bytes are ever requested from the file, so a
    huge file is not loaded into memory just to be cut down afterwards.

    Args:
        path: File to read.
        max_file_bytes: Maximum number of bytes to return; negative values
            are treated as 0.

    Returns:
        A ``(data, truncated)`` pair, where ``truncated`` is True when the
        file holds more than ``max_file_bytes`` bytes.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    limit = max(max_file_bytes, 0)
    with path.open("rb") as fh:
        # One extra byte is enough to tell "exactly limit" from "longer".
        data = fh.read(limit + 1)
    if len(data) > limit:
        return data[:limit], True
    return data, False


def decode_text(data: bytes) -> str:
    """Decode ``data`` as UTF-8 without ever raising.

    Undecodable bytes are replaced with U+FFFD instead of causing an error.
    """
    return str(data, encoding="utf-8", errors="replace")


def _rel_posix(path: Path, root: Path) -> str:
    """Return ``path`` relative to ``root`` using "/" separators."""
    return path.relative_to(root).as_posix()


def _matches_any_glob(rel: str, globs: List[str]) -> bool:
    """Return True if ``rel`` matches a glob; ``**/`` may match zero directories."""
    for pattern in globs:
        if fnmatch.fnmatch(rel, pattern):
            return True
        # fnmatch needs a literal "/" for "**/", so "**/*.tsx" would miss
        # files directly in the root; retry with whole "**" segments removed.
        segments = pattern.split("/")
        last = len(segments) - 1
        collapsed = "/".join(
            seg for idx, seg in enumerate(segments) if seg != "**" or idx == last
        )
        if collapsed != pattern and fnmatch.fnmatch(rel, collapsed):
            return True
    return False


def _is_same_file(path: Path, resolved_target: Path) -> bool:
    """Return True if ``path`` resolves to ``resolved_target``."""
    try:
        return path.resolve() == resolved_target
    except (OSError, RuntimeError):
        # Unresolvable paths (e.g. symlink loops) cannot be the target.
        return False


def pack_dump(
    root: Path,
    out_path: Path,
    include_globs: List[str],
    exclude_dirs: set[str],
    exclude_files: set[str],
    exclude_file_globs: List[str],
    max_file_kb: int,
    only_text: bool,
) -> None:
    """Write a single-file text snapshot of the project under ``root``.

    The output consists of a file index, a header with the root and a UTC
    timestamp, a flat file listing, and then one marker-delimited block per
    file with its metadata and decoded content.

    Args:
        root: Project root directory to walk.
        out_path: File the dump is written to (UTF-8). If it lies inside
            ``root`` it is left out of the dump.
        include_globs: If non-empty, only files whose root-relative path
            matches one of these globs are kept.
        exclude_dirs: Directory names to skip.
        exclude_files: Exact file names to skip.
        exclude_file_globs: Glob patterns of files to skip.
        max_file_kb: Per-file content limit in KB; the rest is cut off.
        only_text: If True, content blocks are written only for files that
            ``is_probably_text`` accepts. The file listing still shows all
            files.

    Raises:
        OSError: If the output file cannot be written.
    """
    max_file_bytes = max_file_kb * 1024
    out_resolved = out_path.resolve()

    # A dump from a previous run that lives inside the root must not be
    # packed into the new dump.
    all_files = [
        p
        for p in iter_project_files(root, exclude_dirs, exclude_files, exclude_file_globs)
        if not _is_same_file(p, out_resolved)
    ]

    if include_globs:
        all_files = [
            p for p in all_files if _matches_any_glob(_rel_posix(p, root), include_globs)
        ]

    entries: List[FileEntry] = []
    blocks: List[str] = [
        f"Снимок проекта: {root.resolve()}",
        f"Дата (UTC): {datetime.now(timezone.utc).isoformat()}",
        "",
        build_tree_listing(all_files, root),
    ]

    for p in sorted(all_files, key=lambda x: str(x)):
        rel = _rel_posix(p, root)

        if only_text and not is_probably_text(p):
            continue

        try:
            raw, truncated = read_file_bytes(p, max_file_bytes)
            # stat() is inside the try as well: the file may disappear or
            # become unreadable while the dump is being built.
            size_on_disk = p.stat().st_size
        except OSError as e:
            blocks.extend(
                [
                    "<<<FILE_BEGIN>>>",
                    f"path: {rel}",
                    "error: не удалось прочитать файл",
                    f"exception: {type(e).__name__}: {e}",
                    "<<<FILE_END>>>",
                    "",
                ]
            )
            continue

        # The digest covers only the bytes that were read, i.e. the
        # possibly truncated prefix.
        sha = sha256_bytes(raw)
        entries.append(FileEntry(rel_path=rel, size=size_on_disk, sha256=sha))

        if truncated:
            truncated_line = f"truncated: true (первые {max_file_kb} KB)"
        else:
            truncated_line = "truncated: false"

        blocks.extend(
            [
                "<<<FILE_BEGIN>>>",
                f"path: {rel}",
                f"size_bytes: {size_on_disk}",
                f"sha256_first_{max_file_kb}kb: {sha}",
                truncated_line,
                "<<<CONTENT>>>",
                decode_text(raw),
                "<<<FILE_END>>>",
                "",
            ]
        )

    # The index can only be built once all files are processed, but it
    # belongs at the very top of the dump.
    blocks.insert(
        0,
        "Индекс файлов (путь | размер | sha256 первых N KB):\n"
        + "\n".join(f"- {e.rel_path} | {e.size} | {e.sha256}" for e in entries)
        + "\n",
    )

    out_path.write_text("\n".join(blocks), encoding="utf-8")


def _non_negative_int(value: str) -> int:
    """Convert a command-line value to an int, rejecting negative numbers."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"ожидалось целое число, получено: {value!r}"
        ) from None
    if number < 0:
        raise argparse.ArgumentTypeError(
            f"значение не может быть отрицательным: {number}"
        )
    return number


def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the dump tool.

    Args:
        argv: Argument list to parse; when None, ``sys.argv[1:]`` is used.

    Returns:
        Namespace with ``root``, ``out``, ``include``, ``exclude_file``,
        ``max_file_kb`` and ``only_text``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            negative ``--max-file-kb``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", default=".", help="Корень проекта")
    ap.add_argument("--out", default="react_ts_frontend.txt", help="Файл-выход (один)")
    ap.add_argument(
        "--include",
        action="append",
        default=[],
        help="Глоб-паттерн для включения (можно несколько), например: 'src/**' или '**/*.tsx'",
    )
    ap.add_argument(
        "--exclude-file",
        action="append",
        default=[],
        help="Глоб-паттерн для исключения файлов, например: '**/*.min.js'",
    )
    ap.add_argument(
        "--max-file-kb",
        # A negative limit would silently cut the tail off every file.
        type=_non_negative_int,
        default=512,
        help="Максимальный объём на один файл (KB). Остальное отрежется.",
    )
    ap.add_argument(
        "--only-text",
        action="store_true",
        help="Включать только вероятно текстовые файлы по расширению/имени",
    )
    return ap.parse_args(argv)


def main() -> None:
    """Command-line entry point: parse the options, build the dump, report.

    Raises:
        SystemExit: If the project root does not exist or is not a directory.
    """
    args = parse_args()
    root = Path(args.root).resolve()
    # os.walk silently yields nothing for a missing directory, which would
    # produce an empty dump and still report success.
    if not root.is_dir():
        raise SystemExit(
            f"Ошибка: корень проекта не найден или не является каталогом: {root}"
        )
    out_path = Path(args.out).resolve()

    pack_dump(
        root=root,
        out_path=out_path,
        include_globs=args.include,
        # Copies, so the module-level defaults are never mutated.
        exclude_dirs=set(DEFAULT_EXCLUDE_DIRS),
        exclude_files=set(DEFAULT_EXCLUDE_FILES),
        exclude_file_globs=args.exclude_file,
        max_file_kb=args.max_file_kb,
        only_text=args.only_text,
    )

    print(f"Готово: {out_path}")


if __name__ == "__main__":
    main()