# dev-puppeteer/my-deepagent/tests/unit/test_artifact_schema.py

"""Unit tests for src/my_deepagent/artifact_schema.py."""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import pytest

from my_deepagent.artifact_schema import (
    ArtifactSchemaRegistry,
    ValidationFinding,
    ValidationResult,
)
from my_deepagent.errors import MyDeepAgentError

# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

# Directory reached by walking three ``.parent`` steps up from this test module.
REPO_ROOT = Path(__file__).parent.parent.parent
# Directory handed to the registry as the seed schema root.
SEED_ROOT = REPO_ROOT.joinpath("docs", "schemas", "artifacts")

# Schema ids the seed-based tests expect the registry to resolve.
SEED_SCHEMA_IDS = [
    "common/final-report@1",
    "dev/phase-plan@1",
    "dev/review-finding-batch@1",
    "dev/spec@1",
]


@pytest.fixture
def seed_registry() -> ArtifactSchemaRegistry:
    """Provide a registry whose only search root is ``SEED_ROOT``."""
    search_roots = [SEED_ROOT]
    return ArtifactSchemaRegistry(roots=search_roots)


@pytest.fixture
def valid_spec() -> dict[str, Any]:
    """Provide the payload the tests treat as a valid ``dev/spec@1`` artifact."""
    payload: dict[str, Any] = {
        "runId": "00000000-0000-4000-8000-000000000000",
        "phaseKey": "spec",
        "requirements": "User wants a CLI tool that analyzes log files.",
        "acceptance_criteria": ["parses .log files", "outputs JSON summary"],
        "approach": "Build a typer-based CLI using regex and json output.",
        "risks": ["log format variations may break parser"],
    }
    return payload


# ---------------------------------------------------------------------------
# 1. Seed schema load success (4 schemas)
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("schema_id", SEED_SCHEMA_IDS)
def test_seed_schema_loads(seed_registry: ArtifactSchemaRegistry, schema_id: str) -> None:
    """Each seed schema loads as a dict whose ``$id`` equals the requested id."""
    loaded = seed_registry.load(schema_id)
    assert isinstance(loaded, dict)
    declared_id = loaded.get("$id")
    assert declared_id == schema_id


# ---------------------------------------------------------------------------
# 2. Load result caching — same dict object on second call
# ---------------------------------------------------------------------------


def test_load_caches_same_object(seed_registry: ArtifactSchemaRegistry) -> None:
    """Two loads of the same schema id return the very same object."""
    schema_id = "dev/spec@1"
    initial = seed_registry.load(schema_id)
    repeat = seed_registry.load(schema_id)
    # Identity, not equality: a fresh-but-equal dict would fail this check.
    assert initial is repeat


# ---------------------------------------------------------------------------
# 3. Unknown schema_id → artifact_schema_unknown
# ---------------------------------------------------------------------------


def test_unknown_schema_id_raises(seed_registry: ArtifactSchemaRegistry) -> None:
    """Loading ``dev/nonexistent@99`` fails with code ``artifact_schema_unknown``."""
    with pytest.raises(MyDeepAgentError) as raised:
        seed_registry.load("dev/nonexistent@99")
    error_code = raised.value.code
    assert error_code == "artifact_schema_unknown"


# ---------------------------------------------------------------------------
# 4. Invalid schema_id format (no slash) → artifact_schema_unknown
# ---------------------------------------------------------------------------


def test_invalid_schema_id_no_slash(seed_registry: ArtifactSchemaRegistry) -> None:
    """A schema id without any ``/`` fails with code ``artifact_schema_unknown``."""
    slashless_id = "foo"
    with pytest.raises(MyDeepAgentError) as raised:
        seed_registry.load(slashless_id)
    assert raised.value.code == "artifact_schema_unknown"


# ---------------------------------------------------------------------------
# 5. schema_id starting with "/" (empty domain segment) → artifact_schema_unknown
# ---------------------------------------------------------------------------


def test_invalid_schema_id_leading_slash(seed_registry: ArtifactSchemaRegistry) -> None:
    """A schema id beginning with ``/`` fails with code ``artifact_schema_unknown``."""
    # The leading "/" leaves the first segment empty, so the id is not a
    # well-formed "domain/name" pair.
    # NOTE(review): presumably the registry ends up with an absolute-looking
    # path that can never exist beneath a root directory — confirm against
    # artifact_schema.py.
    leading_slash_id = "/dev/spec@1"
    with pytest.raises(MyDeepAgentError) as raised:
        seed_registry.load(leading_slash_id)
    assert raised.value.code == "artifact_schema_unknown"


# ---------------------------------------------------------------------------
# 6. Empty schema_id → artifact_schema_unknown
# ---------------------------------------------------------------------------


def test_empty_schema_id_raises(seed_registry: ArtifactSchemaRegistry) -> None:
    """The empty string as schema id fails with code ``artifact_schema_unknown``."""
    empty_id = ""
    with pytest.raises(MyDeepAgentError) as raised:
        seed_registry.load(empty_id)
    assert raised.value.code == "artifact_schema_unknown"


# ---------------------------------------------------------------------------
# 7. Fallback: schema absent in first root, present in second
# ---------------------------------------------------------------------------


def test_fallback_to_second_root(tmp_path: Path) -> None:
    """A schema absent from the first root is loaded from the second root."""
    # First root exists but holds no schemas.
    empty_root = tmp_path / "first"
    empty_root.mkdir()

    # Second root holds dev/thing@1.json.
    populated_root = tmp_path / "second"
    domain_dir = populated_root / "dev"
    domain_dir.mkdir(parents=True)
    schema_doc: dict[str, Any] = {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "$id": "dev/thing@1",
        "type": "object",
    }
    schema_file = domain_dir / "thing@1.json"
    schema_file.write_text(json.dumps(schema_doc), encoding="utf-8")

    registry = ArtifactSchemaRegistry(roots=[empty_root, populated_root])
    resolved = registry.load("dev/thing@1")
    assert resolved["$id"] == "dev/thing@1"


# ---------------------------------------------------------------------------
# 8. validate with valid data → ok=True
# ---------------------------------------------------------------------------


def test_validate_valid_spec(
    seed_registry: ArtifactSchemaRegistry, valid_spec: dict[str, Any]
) -> None:
    """The ``valid_spec`` fixture validates against ``dev/spec@1`` with no errors."""
    outcome = seed_registry.validate("dev/spec@1", valid_spec)
    assert outcome.ok is True
    # A clean result carries an empty tuple of errors.
    assert outcome.errors == ()


# ---------------------------------------------------------------------------
# 9. validate with invalid data → ok=False, findings non-empty
# ---------------------------------------------------------------------------


def test_validate_invalid_data_returns_findings(
    seed_registry: ArtifactSchemaRegistry,
) -> None:
    """An unrelated payload fails validation and every error is a ValidationFinding."""
    unrelated_payload = {"wrong": "data"}
    outcome = seed_registry.validate("dev/spec@1", unrelated_payload)
    assert outcome.ok is False
    assert len(outcome.errors) > 0
    assert all(isinstance(item, ValidationFinding) for item in outcome.errors)


# ---------------------------------------------------------------------------
# 10. Missing required field → validator="required", message names the field
# ---------------------------------------------------------------------------


def test_validate_missing_required_field(
    seed_registry: ArtifactSchemaRegistry, valid_spec: dict[str, Any]
) -> None:
    """Dropping ``requirements`` yields a ``required`` finding that names the field."""
    # Work on a copy so the fixture value itself stays untouched.
    incomplete = dict(valid_spec)
    incomplete.pop("requirements", None)

    outcome = seed_registry.validate("dev/spec@1", incomplete)
    assert outcome.ok is False
    required_messages = [
        finding.message for finding in outcome.errors if finding.validator == "required"
    ]
    assert any("requirements" in message for message in required_messages)


# ---------------------------------------------------------------------------
# 11. Invalid enum value → validator="enum", expected has enum list
# ---------------------------------------------------------------------------


def test_validate_invalid_enum_severity(seed_registry: ArtifactSchemaRegistry) -> None:
    """A bad ``severity`` yields an ``enum`` finding whose ``expected`` is a list."""
    bad_finding = {
        "severity": "bogus",
        "category": "correctness",
        "summary": "something is wrong here",
    }
    batch = {
        "runId": "00000000-0000-4000-8000-000000000000",
        "phaseKey": "review",
        "reviewerRole": "code-reviewer",
        "findings": [bad_finding],
        "summary": "Overall review summary with enough length.",
    }

    outcome = seed_registry.validate("dev/review-finding-batch@1", batch)
    assert outcome.ok is False

    enum_findings = [item for item in outcome.errors if item.validator == "enum"]
    assert len(enum_findings) > 0
    allowed_values = enum_findings[0].expected
    assert isinstance(allowed_values, list)
    # The rejected value must not appear among the allowed ones.
    assert "bogus" not in allowed_values


# ---------------------------------------------------------------------------
# 12. Wrong type → validator="type", expected has type name
# ---------------------------------------------------------------------------


def test_validate_wrong_type(
    seed_registry: ArtifactSchemaRegistry, valid_spec: dict[str, Any]
) -> None:
    """A string for ``acceptance_criteria`` yields a ``type`` finding expecting ``array``."""
    mistyped = {**valid_spec, "acceptance_criteria": "should be a list, not a string"}

    outcome = seed_registry.validate("dev/spec@1", mistyped)
    assert outcome.ok is False

    type_findings = [item for item in outcome.errors if item.validator == "type"]
    assert len(type_findings) > 0
    first_type_finding = type_findings[0]
    assert first_type_finding.expected == "array"


# ---------------------------------------------------------------------------
# 13. Nested error path — /findings/0/severity format
# ---------------------------------------------------------------------------


def test_validate_nested_error_path(seed_registry: ArtifactSchemaRegistry) -> None:
    """An error inside the first finding is reported under a ``/findings/0/`` path."""
    bad_finding = {
        "severity": "not-valid",
        "category": "correctness",
        "summary": "a finding summary",
    }
    batch = {
        "runId": "00000000-0000-4000-8000-000000000000",
        "phaseKey": "review",
        "reviewerRole": "code-reviewer",
        "findings": [bad_finding],
        "summary": "Overall review summary with enough length.",
    }

    outcome = seed_registry.validate("dev/review-finding-batch@1", batch)
    assert outcome.ok is False
    reported_paths = [item.path for item in outcome.errors]
    assert any(path.startswith("/findings/0/") for path in reported_paths)


# ---------------------------------------------------------------------------
# 14. known_schema_ids() returns all 4 seed schemas, sorted
# ---------------------------------------------------------------------------


def test_known_schema_ids_returns_seeds(seed_registry: ArtifactSchemaRegistry) -> None:
    """``known_schema_ids`` includes every seed id and comes back sorted."""
    ids = seed_registry.known_schema_ids()
    assert all(seed_id in ids for seed_id in SEED_SCHEMA_IDS)
    # Comparing against sorted() also requires the result to be a list.
    assert ids == sorted(ids)


# ---------------------------------------------------------------------------
# 15. Empty roots list → config_invalid
# ---------------------------------------------------------------------------


def test_empty_roots_raises() -> None:
    """Constructing a registry with no roots fails with code ``config_invalid``."""
    no_roots: list[Path] = []
    with pytest.raises(MyDeepAgentError) as raised:
        ArtifactSchemaRegistry(roots=no_roots)
    assert raised.value.code == "config_invalid"


# ---------------------------------------------------------------------------
# 16. Corrupted JSON file → artifact_schema_load_failed
# ---------------------------------------------------------------------------


def test_corrupted_json_raises(tmp_path: Path) -> None:
    """A schema file with malformed JSON fails with ``artifact_schema_load_failed``."""
    domain_dir = tmp_path / "dev"
    domain_dir.mkdir()
    # A lone "{" is not parseable JSON.
    (domain_dir / "broken@1.json").write_text("{", encoding="utf-8")

    registry = ArtifactSchemaRegistry(roots=[tmp_path])
    with pytest.raises(MyDeepAgentError) as raised:
        registry.load("dev/broken@1")
    assert raised.value.code == "artifact_schema_load_failed"


# ---------------------------------------------------------------------------
# 17. Valid JSON but not a dict → artifact_schema_load_failed
# ---------------------------------------------------------------------------


def test_non_dict_json_raises(tmp_path: Path) -> None:
    """A schema file holding a JSON array fails with ``artifact_schema_load_failed``."""
    domain_dir = tmp_path / "dev"
    domain_dir.mkdir()
    # Well-formed JSON, but the top-level value is an array rather than an object.
    (domain_dir / "array@1.json").write_text("[1, 2, 3]", encoding="utf-8")

    registry = ArtifactSchemaRegistry(roots=[tmp_path])
    with pytest.raises(MyDeepAgentError) as raised:
        registry.load("dev/array@1")
    assert raised.value.code == "artifact_schema_load_failed"


# ---------------------------------------------------------------------------
# 18. Schema itself is invalid Draft 2020-12 → artifact_schema_load_failed
# ---------------------------------------------------------------------------


def test_invalid_draft_schema_raises(tmp_path: Path) -> None:
    """A schema with a bogus ``type`` keyword fails with ``artifact_schema_load_failed``."""
    domain_dir = tmp_path / "dev"
    domain_dir.mkdir()
    # Parseable JSON object, but "not_a_type" is not a JSON Schema type name.
    malformed_schema = {"type": "not_a_type"}
    schema_file = domain_dir / "bad@1.json"
    schema_file.write_text(json.dumps(malformed_schema), encoding="utf-8")

    registry = ArtifactSchemaRegistry(roots=[tmp_path])
    with pytest.raises(MyDeepAgentError) as raised:
        registry.load("dev/bad@1")
    assert raised.value.code == "artifact_schema_load_failed"


# ---------------------------------------------------------------------------
# 19. Validator caching: _validator called twice returns same instance
# ---------------------------------------------------------------------------


def test_validator_instance_cached(seed_registry: ArtifactSchemaRegistry) -> None:
    """Two ``_validator`` calls for one schema id return the same instance."""
    schema_id = "dev/spec@1"
    # Deliberately reaches into the private helper to observe instance reuse.
    initial = seed_registry._validator(schema_id)
    repeat = seed_registry._validator(schema_id)
    assert initial is repeat


# ---------------------------------------------------------------------------
# 20. dev/spec@1 valid example produces ok=True (full fixture check)
# ---------------------------------------------------------------------------


def test_spec_valid_example_ok(seed_registry: ArtifactSchemaRegistry) -> None:
    """An inline ``dev/spec@1`` example validates with ``ok=True`` and no errors."""
    example: dict[str, Any] = {
        "runId": "00000000-0000-4000-8000-000000000000",
        "phaseKey": "spec",
        "requirements": "User wants a CLI tool that analyzes log files.",
        "acceptance_criteria": ["parses .log files", "outputs JSON summary"],
        "approach": "Build a typer-based CLI using regex and json output.",
        "risks": ["log format variations may break parser"],
    }

    outcome = seed_registry.validate("dev/spec@1", example)
    assert outcome.ok is True
    assert outcome.errors == ()


# ---------------------------------------------------------------------------
# Bonus: ValidationResult and ValidationFinding are frozen dataclasses
# ---------------------------------------------------------------------------


def test_validation_result_frozen() -> None:
    """Assigning to ``ValidationResult.ok`` raises AttributeError or TypeError."""
    immutable_errors = (AttributeError, TypeError)
    outcome = ValidationResult(ok=True)
    with pytest.raises(immutable_errors):
        outcome.ok = False  # type: ignore[misc]


def test_validation_finding_frozen() -> None:
    """Assigning to ``ValidationFinding.path`` raises AttributeError or TypeError."""
    immutable_errors = (AttributeError, TypeError)
    sample = ValidationFinding(
        path="/foo",
        message="err",
        validator="type",
        expected="string",
    )
    with pytest.raises(immutable_errors):
        sample.path = "/bar"  # type: ignore[misc]


# ---------------------------------------------------------------------------
# Bonus: known_schema_ids with nonexistent root dir is silently skipped
# ---------------------------------------------------------------------------


def test_known_schema_ids_skips_nonexistent_root(tmp_path: Path) -> None:
    """A root directory that does not exist yields an empty id list, not an error."""
    absent_root = tmp_path / "does_not_exist"
    registry = ArtifactSchemaRegistry(roots=[absent_root])
    discovered = registry.known_schema_ids()
    assert discovered == []


# ---------------------------------------------------------------------------
# Bonus: validate with non-dict top-level data
# ---------------------------------------------------------------------------


def test_validate_non_dict_data_returns_error(
    seed_registry: ArtifactSchemaRegistry,
) -> None:
    """A list as top-level data fails ``dev/spec@1`` with at least one ``type`` finding."""
    top_level_list = [1, 2, 3]
    outcome = seed_registry.validate("dev/spec@1", top_level_list)
    assert outcome.ok is False
    type_findings = [item for item in outcome.errors if item.validator == "type"]
    assert len(type_findings) > 0