Source code for sphinx_autodoc_toml.parser

"""Core parser for extracting doc-comments from TOML files.

.. spec:: The parser MUST recognize doc-comments marked with #: syntax.
   :id: S_PARSER_001
   :status: implemented
   :tags: parser, syntax
   :links: R_SPEC_001

.. spec:: The parser MUST validate the Separator Rule for doc-comments.
   :id: S_PARSER_002
   :status: implemented
   :tags: parser, validation
   :links: R_SPEC_002, R_PARSE_003

.. spec:: The parser MUST validate the Attachment Rule for doc-comments.
   :id: S_PARSER_003
   :status: implemented
   :tags: parser, validation
   :links: R_SPEC_003, R_PARSE_004

.. spec:: The parser MUST support multi-line doc-comments.
   :id: S_PARSER_004
   :status: implemented
   :tags: parser, syntax
   :links: R_SPEC_004

.. spec:: The parser MUST use tomlkit to preserve comments and whitespace.
   :id: S_PARSER_005
   :status: implemented
   :tags: parser, toml
   :links: R_PARSE_001

.. spec:: The parser MUST extract all valid doc-comments from a TOML file.
   :id: S_PARSER_006
   :status: implemented
   :tags: parser, extraction
   :links: R_PARSE_002

.. spec:: The parser MUST identify and parse TOML table headers.
   :id: S_PARSER_007
   :status: implemented
   :tags: parser, toml
   :links: R_PARSE_005

.. spec:: The parser MUST identify and parse TOML key-value pairs.
   :id: S_PARSER_008
   :status: implemented
   :tags: parser, toml
   :links: R_PARSE_005

.. spec:: The parser MUST determine hierarchical TOML paths for items.
   :id: S_PARSER_009
   :status: implemented
   :tags: parser, toml, hierarchy
   :links: R_PARSE_006

.. spec:: The parser MUST extract TOML content for documented items.
   :id: S_PARSER_010
   :status: implemented
   :tags: parser, extraction
   :links: R_PARSE_007
"""

import re
from pathlib import Path
from typing import Any, List, Optional

import tomlkit
from tomlkit.items import Table



[docs]
class DocComment:
    """A doc-comment block paired with the location of the TOML item it documents.

    Attributes:
        path: Key path of the documented item, one list element per segment.
        content: Doc-comment text with the ``#:`` markers already removed.
        line_number: Line number on which the doc-comment block begins.
        toml_content: TOML source text of the documented item ("" if unknown).
    """

    def __init__(self, path: List[str], content: str, line_number: int, toml_content: str = ""):
        """
        Store the pieces of an extracted doc-comment.

        Args:
            path: Key path segments, e.g. ["project", "dependencies"]
            content: Doc-comment text without the #: markers
            line_number: Line number at which the doc-comment block starts
            toml_content: TOML text of the documented item (defaults to "")
        """
        self.toml_content = toml_content
        self.line_number = line_number
        self.content = content
        self.path = path

    @property
    def full_path(self) -> str:
        """Dotted form of the path, e.g. 'project.dependencies' ('' when empty)."""
        if not self.path:
            return ""
        return ".".join(self.path)

    @property
    def toml_path(self) -> str:
        """Bracketed table notation, e.g. '[project.dependencies]' ('' when empty)."""
        return f"[{self.full_path}]" if self.path else ""

    def __repr__(self) -> str:
        dotted = self.full_path
        line = self.line_number
        return f"DocComment(path={dotted!r}, line={line})"




[docs]
class TomlDocParser:
    """Parser for extracting doc-comments from TOML files according to TOML-Doc spec.

    The file is scanned line by line for blocks of ``#:`` comments. A block is
    accepted when it is preceded by an empty line (Separator Rule) and is
    followed, with no empty line in between (Attachment Rule), by a table
    header or a key assignment. ``tomlkit`` is used only to render the TOML
    content of the documented item.
    """

    # A doc-comment line: "#:" optionally followed by one whitespace character;
    # group 1 is the comment text.
    DOC_COMMENT_PATTERN = re.compile(r"^#:\s?(.*)")
    # A table header "[a.b]" or array-of-tables header "[[a.b]]", optionally
    # followed by a trailing comment; group 1 is the (dotted) key inside the
    # brackets. Anchoring at the end of the line keeps rows of a multi-line
    # array such as '["a", "b"],' from being mistaken for headers.
    TABLE_PATTERN = re.compile(r"^\[\[?\s*([^\[\]]+?)\s*\]\]?\s*(?:#.*)?$")
    # One segment of a key: bare, basic-quoted ("...") or literal-quoted ('...').
    _KEY_PART = r"""(?:[A-Za-z0-9_-]+|"(?:[^"\\]|\\.)*"|'[^']*')"""
    # A key assignment; group 1 is the (possibly dotted and/or quoted) key.
    KEY_PATTERN = re.compile(rf"^({_KEY_PART}(?:\s*\.\s*{_KEY_PART})*)\s*=")

    def __init__(self, toml_path: Path):
        """
        Initialize the parser.

        Args:
            toml_path: Path to the TOML file to parse
        """
        self.toml_path = toml_path
        # TOML documents are UTF-8 by specification, so do not rely on the
        # platform's locale encoding when reading the file.
        self.raw_content = toml_path.read_text(encoding="utf-8")
        self.lines = self.raw_content.splitlines()
        # Parse with tomlkit to extract actual TOML content
        self.toml_doc = tomlkit.parse(self.raw_content)

    def parse(self) -> List["DocComment"]:
        """
        Parse the TOML file and extract all valid doc-comments.

        Returns:
            List of DocComment objects, in file order
        """
        doc_comments = []
        i = 0
        total = len(self.lines)

        while i < total:
            # Strip before matching so that indented doc-comments are found,
            # consistent with how every other method reads a line.
            if not self.DOC_COMMENT_PATTERN.match(self.lines[i].strip()):
                i += 1
                continue

            doc_comment = self._extract_doc_comment_block(i)
            if doc_comment is None:
                i += 1
                continue

            doc_comments.append(doc_comment)
            # Jump over the remaining lines of this block; the lines that
            # follow are not doc-comment lines and are skipped one by one.
            i += len(doc_comment.content.split("\n"))

        return doc_comments

    def _extract_doc_comment_block(self, start_line: int) -> Optional["DocComment"]:
        """
        Extract a doc-comment block starting at the given line.

        Args:
            start_line: The line index where the doc-comment starts (0-indexed)

        Returns:
            DocComment if the block satisfies the Separator and Attachment
            rules and is followed by a table header or key; None otherwise
        """
        # Separator Rule: the block must be preceded by an empty line
        if not self._check_separator_rule(start_line):
            return None

        # Collect all consecutive #: lines
        doc_lines = []
        current_line = start_line
        while current_line < len(self.lines):
            match = self.DOC_COMMENT_PATTERN.match(self.lines[current_line].strip())
            if match is None:
                break
            doc_lines.append(match.group(1))
            current_line += 1

        if not doc_lines:
            return None

        # The documented item is the first table header or key after the block
        item_line_idx = self._find_next_toml_item(current_line)
        if item_line_idx is None:
            return None

        # Attachment Rule: no empty lines between the block and the item
        if not self._check_attachment_rule(current_line - 1, item_line_idx):
            return None

        path = self._get_toml_path_for_line(item_line_idx)
        if path is None:
            return None

        return DocComment(
            path=path,
            content="\n".join(doc_lines),
            line_number=start_line + 1,  # reported 1-indexed
            toml_content=self._extract_toml_content(path),
        )

    def _check_separator_rule(self, doc_start_line: int) -> bool:
        """
        Check the Separator Rule: doc-comment must be preceded by empty line.

        Args:
            doc_start_line: The line index where the doc-comment starts

        Returns:
            True if the block starts the file or the line directly before it
            is empty (whitespace only); False otherwise
        """
        if doc_start_line <= 0:
            return True
        return not self.lines[doc_start_line - 1].strip()

    def _check_attachment_rule(self, doc_end_line: int, item_line: int) -> bool:
        """
        Check the Attachment Rule: no empty lines between doc-comment and item.

        Args:
            doc_end_line: The line index where the doc-comment ends
            item_line: The line index of the TOML item

        Returns:
            True if every line strictly between the two indices is non-empty
        """
        return all(self.lines[i].strip() for i in range(doc_end_line + 1, item_line))

    def _find_next_toml_item(self, start_line: int) -> Optional[int]:
        """
        Find the next TOML item (table or key) after the given line.

        Empty lines and regular ``#`` comments are skipped. The search stops at
        the first other line: if that line is not a table header or key
        assignment, no item is reported, so a doc-comment is never attached to
        an unrelated item further down the file.

        Args:
            start_line: The line index to start searching from

        Returns:
            Line index of the next TOML item, or None if not found
        """
        for i in range(start_line, len(self.lines)):
            line = self.lines[i].strip()

            # Skip empty lines and regular comments
            if not line or (line.startswith("#") and not line.startswith("#:")):
                continue

            if self.TABLE_PATTERN.match(line) or self.KEY_PATTERN.match(line):
                return i

            # First real content is something else (e.g. a value continuation
            # line or another doc-comment): there is nothing to document here.
            return None

        return None

    @staticmethod
    def _split_dotted_key(key: str) -> List[str]:
        """
        Split a TOML key such as ``tool."my.pkg"`` into its segments.

        Dots inside quoted segments do not split, surrounding quotes are
        removed, and whitespace outside quotes is ignored. Inside a basic
        (double-quoted) segment a backslash keeps the following character
        verbatim; escape sequences are not otherwise decoded.

        Args:
            key: The key text as written in a header or before ``=``

        Returns:
            List of key segments (e.g. ["tool", "my.pkg"])
        """
        parts: List[str] = []
        current: List[str] = []
        quote = ""
        chars = iter(key)
        for ch in chars:
            if quote:
                if ch == "\\" and quote == '"':
                    current.append(next(chars, ""))
                elif ch == quote:
                    quote = ""
                else:
                    current.append(ch)
            elif ch in "\"'":
                quote = ch
            elif ch == ".":
                parts.append("".join(current))
                current = []
            elif not ch.isspace():
                current.append(ch)
        parts.append("".join(current))
        return parts

    def _get_toml_path_for_line(self, line_idx: int) -> Optional[List[str]]:
        """
        Get the TOML path for an item at the given line.

        Args:
            line_idx: The line index of the TOML item

        Returns:
            List representing the path (e.g., ["project", "dependencies"]),
            or None if the line is neither a table header nor a key assignment
        """
        line = self.lines[line_idx].strip()

        # Table header (or array-of-tables header): the path is the header key
        table_match = self.TABLE_PATTERN.match(line)
        if table_match:
            return self._split_dotted_key(table_match.group(1))

        # Key assignment: the path is the enclosing table plus the key
        key_match = self.KEY_PATTERN.match(line)
        if key_match:
            key_parts = self._split_dotted_key(key_match.group(1))
            return self._find_current_table(line_idx) + key_parts

        return None

    def _find_current_table(self, line_idx: int) -> List[str]:
        """
        Find the table context for a line by looking backwards.

        Args:
            line_idx: The line index to find the table context for

        Returns:
            Path of the closest preceding table header, or [] at root level
        """
        for i in range(line_idx - 1, -1, -1):
            table_match = self.TABLE_PATTERN.match(self.lines[i].strip())
            if table_match:
                return self._split_dotted_key(table_match.group(1))

        # No table found, we're at the root level
        return []

    def _extract_toml_content(self, path: List[str]) -> str:
        """
        Extract the TOML content for a given path.

        Args:
            path: The path to the TOML item (e.g., ["project", "dependencies"])

        Returns:
            The TOML content as a string: the serialized table body for a
            table, ``key = value`` for a key, or "" if the path cannot be
            resolved in the parsed document
        """
        if not path:
            return ""

        try:
            # Walk down to the parent container of the final path segment
            current: Any = self.toml_doc
            for part in path[:-1]:
                if part not in current:
                    return ""
                current = current[part]

            last_part = path[-1]
            if last_part not in current:
                return ""
            item = current[last_part]

            if isinstance(item, (dict, Table)):
                # A table: serialize its contents through a temporary document
                temp_doc = tomlkit.document()
                temp_doc.update(item)
                return tomlkit.dumps(temp_doc).strip()

            # A key-value pair: dump it under a placeholder key, then swap the
            # placeholder for the real key name
            value_str = tomlkit.dumps({"temp": item}).strip()
            if "=" in value_str:
                value_part = value_str.split("=", 1)[1].strip()
                return f"{last_part} = {value_part}"
            return ""
        except (KeyError, TypeError):
            # Raised when an intermediate value is not a container
            return ""




[docs]
def parse_toml_file(toml_path: Path) -> List[DocComment]:
    """
    Convenience wrapper: build a TomlDocParser for a file and run it.

    Args:
        toml_path: Path to the TOML file

    Returns:
        List of DocComment objects produced by ``TomlDocParser.parse()``

    Example:
        >>> from pathlib import Path
        >>> doc_comments = parse_toml_file(Path("pyproject.toml"))
        >>> for dc in doc_comments:
        ...     print(f"{dc.toml_path}: {dc.content}")
    """
    return TomlDocParser(toml_path).parse()
sphinx-autodoc-toml

Navigation

Related Topics

Source code for sphinx_autodoc_toml.parser