# Source code for intelliscraper.parsers.base_parser

"""Abstract base class for content parsers.

All parsers in IntelliScraper extend ``BaseParser`` to provide a
consistent interface for extracting text, links, and Markdown from
scraped HTML content.

To create a site-specific parser, subclass ``HTMLParser`` (which
already implements ``BaseParser``) and add your custom extraction
logic as ``@cached_property`` methods.

Example::

    class MyCustomParser(HTMLParser):
        @cached_property
        def product_title(self) -> str | None:
            tag = self.soup.select_one("h1.product-title")
            return tag.get_text(strip=True) if tag else None
"""

from __future__ import annotations

import abc
from functools import cached_property


class BaseParser(abc.ABC):
    """Abstract base for all content parsers.

    Defines the minimum interface that every parser must implement.
    Concrete parsers should extend ``HTMLParser`` rather than this class
    directly, unless they need a fundamentally different parsing engine.

    The content accessors (``text``, ``links``, ``markdown``) are declared
    as abstract *properties*. ``functools.cached_property`` does not expose
    ``__isabstractmethod__``, so wrapping an abstract method in it leaves
    the member concrete as far as ``ABCMeta`` is concerned; a plain
    ``property`` keeps the abstract contract enforced at instantiation
    time. Subclasses remain free to implement each accessor with
    ``@cached_property``.

    Args:
        url: The source URL of the scraped page. Used for normalising
            relative links.
        html: Raw HTML string to parse.
    """

    @abc.abstractmethod
    def __init__(self, url: str, html: str) -> None:
        """Initialize the parser with URL and HTML content."""

    @property
    @abc.abstractmethod
    def text(self) -> str:
        """Extract plain text from the HTML content."""

    @property
    @abc.abstractmethod
    def links(self) -> list[str]:
        """Extract all normalised ``href`` values from ``<a>`` tags."""

    @property
    @abc.abstractmethod
    def markdown(self) -> str:
        """Convert the HTML to Markdown."""