# Source code for intelliscraper.parsers.base_parser

"""Abstract base class for content parsers.

All parsers in IntelliScraper extend ``BaseParser`` to provide a
consistent interface for extracting text, links, and Markdown from
scraped HTML content.

To create a site-specific parser, subclass ``HTMLParser`` (which
already implements ``BaseParser``) and add your custom extraction
logic as ``@cached_property`` methods.

Example::

    class MyCustomParser(HTMLParser):
        @cached_property
        def product_title(self) -> str | None:
            tag = self.soup.select_one("h1.product-title")
            return tag.get_text(strip=True) if tag else None
"""

from __future__ import annotations

import abc
from functools import cached_property



class BaseParser(abc.ABC):
    """Abstract base for all content parsers.

    Defines the minimum interface that every parser must implement:
    a constructor taking the page URL and HTML, plus the ``text``,
    ``links`` and ``markdown`` attributes.  Concrete parsers should
    extend ``HTMLParser`` rather than this class directly, unless they
    need a fundamentally different parsing engine.

    Args:
        url: The source URL of the scraped page.  Used for
            normalising relative links.
        html: Raw HTML string to parse.

    Note:
        The abstract attributes are declared with ``property`` rather
        than ``functools.cached_property``.  ``cached_property`` does
        not expose ``__isabstractmethod__``, so wrapping an abstract
        method in it hides the method from ``abc.ABCMeta`` and lets
        incomplete subclasses be instantiated.  Subclasses may still
        override these attributes with ``@cached_property``.
    """

    @abc.abstractmethod
    def __init__(self, url: str, html: str) -> None:
        """Initialize the parser with URL and HTML content."""

    # ``property`` forwards ``__isabstractmethod__`` from the wrapped
    # function, so ABCMeta refuses to instantiate a subclass that does
    # not override each of the attributes below.

    @property
    @abc.abstractmethod
    def text(self) -> str:
        """Extract plain text from the HTML content."""

    @property
    @abc.abstractmethod
    def links(self) -> list[str]:
        """Extract all normalised ``href`` values from ``<a>`` tags."""

    @property
    @abc.abstractmethod
    def markdown(self) -> str:
        """Convert the HTML to Markdown."""