Source code for jamb.storage.document_dag

"""DAG-based document hierarchy for jamb."""

from collections import deque
from dataclasses import dataclass, field
from pathlib import Path

from jamb.storage.document_config import DocumentConfig


[docs] @dataclass class DocumentDAG: """Directed acyclic graph of document relationships. Supports multiple parents per document (DAG structure). Attributes: documents (dict[str, DocumentConfig]): Mapping of document prefix to its :class:`~jamb.storage.document_config.DocumentConfig`. document_paths (dict[str, Path]): Mapping of document prefix to the filesystem path of the document directory. """ documents: dict[str, DocumentConfig] = field(default_factory=dict) document_paths: dict[str, Path] = field(default_factory=dict)
[docs] def get_parents(self, prefix: str) -> list[str]: """Get parent document prefixes for a given document. Args: prefix: The document prefix to look up. Returns: List of parent document prefix strings. Empty if the document has no parents or is not found. """ config = self.documents.get(prefix) if config is None: return [] return list(config.parents)
[docs] def get_children(self, prefix: str) -> list[str]: """Get child document prefixes for a given document. Args: prefix: The document prefix to look up. Returns: List of child document prefix strings. """ children = [] for p, config in self.documents.items(): if prefix in config.parents: children.append(p) return children
[docs] def get_root_documents(self) -> list[str]: """Get documents with no parents. Returns: List of document prefix strings that have no parent documents. """ return [p for p, config in self.documents.items() if not config.parents]
[docs] def get_leaf_documents(self) -> list[str]: """Get documents with no children. Returns: List of document prefix strings that have no child documents. """ all_parents = set() for config in self.documents.values(): all_parents.update(config.parents) return [p for p in self.documents if p not in all_parents]
[docs] def topological_sort(self) -> list[str]: """Return prefixes in topological order (parents before children). Uses Kahn's algorithm. If there are cycles, remaining nodes are appended at the end. Raises: ValueError: If any document references unknown parent documents. """ # Build in-degree map in_degree: dict[str, int] = {p: 0 for p in self.documents} children_map: dict[str, list[str]] = {p: [] for p in self.documents} missing_parents: list[str] = [] for prefix, config in self.documents.items(): for parent in config.parents: if parent in self.documents: in_degree[prefix] += 1 children_map[parent].append(prefix) else: missing_parents.append(f"{prefix} references unknown parent {parent}") if missing_parents: raise ValueError(f"Document DAG has missing parents: {', '.join(missing_parents)}") # Start with nodes that have no parents queue: deque[str] = deque() for prefix, degree in in_degree.items(): if degree == 0: queue.append(prefix) result: list[str] = [] while queue: node = queue.popleft() result.append(node) for child in children_map[node]: in_degree[child] -= 1 if in_degree[child] == 0: queue.append(child) # Append any remaining (cycle participants) for prefix in self.documents: if prefix not in result: result.append(prefix) return result
[docs] def validate_acyclic(self) -> list[str]: """Check for cycles in the DAG. Returns: List of error messages. Empty if no cycles. """ # Build in-degree map and children adjacency (single Kahn's pass) in_degree: dict[str, int] = {p: 0 for p in self.documents} children_map: dict[str, list[str]] = {p: [] for p in self.documents} for prefix, config in self.documents.items(): for parent in config.parents: if parent in self.documents: in_degree[prefix] += 1 children_map[parent].append(prefix) queue: deque[str] = deque() for prefix, degree in in_degree.items(): if degree == 0: queue.append(prefix) visited: set[str] = set() while queue: node = queue.popleft() visited.add(node) for child in children_map[node]: in_degree[child] -= 1 if in_degree[child] == 0: queue.append(child) errors: list[str] = [] cycle_nodes = set(self.documents.keys()) - visited if cycle_nodes: errors.append(f"Cycle detected among documents: {', '.join(sorted(cycle_nodes))}") return errors