"""Local browser backend: connects to Chrome via CDP.
Connects to a user's already-running Chrome instance via the Chrome
DevTools Protocol (CDP) on a configurable port. All existing cookies,
logins, and sessions in that browser are immediately available; no
``session_data`` or authentication code is needed.
Before using this backend, either:
1. Start Chrome manually::
google-chrome \\
--remote-debugging-port=9222 \\
--user-data-dir="$HOME/.config/google-chrome-debug" \\
--profile-directory="Default"
2. Or let the backend auto-launch Chrome (it will attempt to find and
start Chrome with the debug profile if the port is not already open).
Important:
The debug profile at ``~/.config/google-chrome-debug`` is separate
from your default Chrome profile. You must log into target sites
(e.g. LinkedIn) in this profile *before* scraping. Use::
make chrome-debug-login URL=https://www.linkedin.com
to open Chrome with this profile and log in.
"""
from __future__ import annotations
import asyncio
import logging
import os
import shutil
import subprocess
import urllib.error
import urllib.request
from typing import TYPE_CHECKING
from intelliscraper.browser.backend import BrowserBackend
from intelliscraper.exception import LocalBrowserConnectionError
if TYPE_CHECKING:
from playwright.async_api import Browser, BrowserContext, Playwright
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# CDP connection constants.
# Default value of the ``cdp_port`` constructor argument.
_CDP_DEFAULT_PORT = 9222
# Nominal time budget (seconds) for waiting on the debug port; also quoted
# in the "waiting" log line and in the connection error message.
_CDP_MAX_WAIT_SEC = 20
# Pause (seconds) between successive probes of the debug port.
_CDP_POLL_INTERVAL_SEC = 1.0
# Timeout (milliseconds) passed to ``connect_over_cdp``.
_CDP_CONNECT_TIMEOUT_MS = 10_000
[docs]
class LocalBrowserBackend(BrowserBackend):
    """Connect to an already-running Chrome instance via CDP.

    This backend reuses the user's real Chrome session, preserving all
    cookies, localStorage, and authenticated state. It is ideal for
    scraping sites that require complex authentication flows (e.g.
    LinkedIn, Gmail).

    Args:
        cdp_port: Chrome DevTools Protocol port. Defaults to 9222.
        headless: If Chrome needs to be auto-launched, whether to run it
            headless. Defaults to ``True``.
        profile_dir: Path to the Chrome user-data-dir used for the debug
            profile. A leading ``~`` is expanded. Defaults to
            ``~/.config/google-chrome-debug``.

    Raises:
        LocalBrowserConnectionError: If Chrome is not reachable after
            ``_CDP_MAX_WAIT_SEC`` seconds.

    Example::

        backend = LocalBrowserBackend(cdp_port=9222, headless=False)
        browser, context = await backend.initialize(playwright)
    """

    def __init__(
        self,
        cdp_port: int = _CDP_DEFAULT_PORT,
        headless: bool = True,
        profile_dir: str | None = None,
    ) -> None:
        self._cdp_port = cdp_port
        self._cdp_url = f"http://localhost:{cdp_port}"
        self._headless = headless
        if profile_dir:
            # Chrome is started without a shell, so "~" would otherwise be
            # passed through literally.
            self._profile_dir = os.path.expanduser(profile_dir)
        else:
            self._profile_dir = os.path.join(
                os.path.expanduser("~"), ".config", "google-chrome-debug"
            )

    # BrowserBackend interface

    @property
    def owns_browser(self) -> bool:
        """Local backend does not own the browser process."""
        return False

    async def initialize(
        self,
        playwright: Playwright,
    ) -> tuple[Browser, BrowserContext]:
        """Connect to Chrome via CDP and return browser + context.

        If Chrome is not already running on the configured port, the
        backend will attempt to auto-launch it using the debug profile.

        Args:
            playwright: A started Playwright instance.

        Returns:
            A ``(browser, context)`` tuple. The context is the first
            existing context found in the CDP browser (preserving all
            cookies and logins); a fresh context is created only when
            the browser exposes none.

        Raises:
            LocalBrowserConnectionError: If Chrome cannot be launched,
                cannot be reached, or the CDP connection fails.
        """
        logger.info("Checking Chrome debug port at %s ...", self._cdp_url)

        # Fast pre-check: is Chrome already up? The probe is a blocking
        # HTTP request, so run it in a worker thread to keep the event
        # loop responsive.
        loop = asyncio.get_running_loop()
        already_up = await loop.run_in_executor(None, self._is_chrome_port_open)
        if not already_up:
            logger.info(
                "Chrome debug port not detected attempting to launch "
                "Chrome with remote debugging enabled ..."
            )
            await self._launch_chrome_with_debugging()
            logger.info(
                "Waiting up to %ds for Chrome to be ready ...",
                _CDP_MAX_WAIT_SEC,
            )

        # Poll until Chrome is reachable.
        reachable = await self._wait_for_chrome_port()
        if not reachable:
            # Quote the configured port and profile so the suggested
            # command matches what this backend will actually probe.
            raise LocalBrowserConnectionError(
                f"Chrome not reachable on port {self._cdp_port} after "
                f"{_CDP_MAX_WAIT_SEC}s.\n\n"
                "Please start Chrome manually with remote debugging:\n\n"
                " google-chrome \\\n"
                f" --remote-debugging-port={self._cdp_port} \\\n"
                f' --user-data-dir="{self._profile_dir}" \\\n'
                ' --profile-directory="Default"\n\n'
                "Or use: make chrome-debug-login URL=https://example.com\n\n"
                "Note: all existing Chrome windows must be FULLY closed "
                "first, then run the command above before using "
                "IntelliScraper."
            )

        # Connect over CDP.
        logger.info("Connecting to Chrome via CDP ...")
        try:
            browser = await playwright.chromium.connect_over_cdp(
                endpoint_url=self._cdp_url,
                timeout=_CDP_CONNECT_TIMEOUT_MS,
            )
        except Exception as exc:
            # Boundary catch: whatever Playwright raised is re-raised as
            # the backend's own error type with the cause chained.
            raise LocalBrowserConnectionError(
                f"Chrome is reachable at {self._cdp_url} but CDP "
                f"connection failed.\nError: {exc}"
            ) from exc

        # Reuse existing context (preserves all logins / cookies).
        if browser.contexts:
            context = browser.contexts[0]
            logger.info(
                "Reusing existing browser context (%d context(s) found)",
                len(browser.contexts),
            )
        else:
            logger.warning(
                "No existing context found in CDP browser — creating a "
                "fresh one. You may need to log in to target sites "
                "manually."
            )
            context = await browser.new_context(ignore_https_errors=True)

        logger.info("Local browser (CDP) ready.")
        return browser, context

    async def cleanup(
        self,
        browser: Browser,
        context: BrowserContext,
    ) -> None:
        """Clean up local browser resources.

        In CDP mode the Chrome process and context belong to the user,
        so they are **not** closed. Only a debug log entry is emitted.

        Args:
            browser: The CDP-connected browser (not closed).
            context: The reused browser context (not closed).
        """
        logger.debug(
            "CDP mode: skipping context/browser close "
            "(Chrome instance belongs to user)"
        )

    def _is_chrome_port_open(self) -> bool:
        """Return ``True`` if the Chrome debug port already responds.

        Performs a single synchronous HTTP probe against
        ``/json/version`` with a 2-second timeout. This call blocks, so
        async callers should run it in an executor.
        """
        try:
            # The context manager closes the response so no socket is
            # left open after the probe.
            with urllib.request.urlopen(
                f"{self._cdp_url}/json/version", timeout=2
            ):
                return True
        except Exception:
            # Deliberate best-effort: any failure (refused connection,
            # timeout, bad HTTP response, ...) just means "not reachable".
            return False

    async def _launch_chrome_with_debugging(self) -> None:
        """Launch Chrome with remote debugging using the debug profile.

        Searches ``PATH`` for a Chrome/Chromium executable, asks
        ``pkill`` to terminate processes that may hold the profile lock,
        then launches Chrome detached with the required flags. If no
        executable is found, a warning is logged and nothing is launched.

        Note:
            ``pkill -f`` matches only command lines containing
            ``google-chrome-debug`` (or ``chromium-browser`` followed by
            ``debug``), i.e. the default debug profile directory name.
            NOTE(review): a custom ``profile_dir`` whose path lacks that
            name is not matched, so a stale Chrome holding it is left
            running.

        Raises:
            LocalBrowserConnectionError: If the Chrome process cannot be
                spawned.
        """
        chrome_executables = [
            "google-chrome",
            "google-chrome-stable",
            "chromium-browser",
            "chromium",
        ]
        chrome_bin = next(
            (exe for exe in chrome_executables if shutil.which(exe)),
            None,
        )
        if chrome_bin is None:
            logger.warning(
                "Could not find a Chrome/Chromium executable. Please "
                "start Chrome manually with "
                "--remote-debugging-port=%s.",
                self._cdp_port,
            )
            return

        # Kill existing Chrome processes to unlock the profile.
        logger.info(
            "Terminating existing Chrome processes to unlock the debug profile ..."
        )
        try:
            for pattern in ("google-chrome-debu[g]", "chromium-browser.*debu[g]"):
                subprocess.run(
                    ["pkill", "-f", pattern],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    check=False,
                )
        except OSError as exc:
            # e.g. ``pkill`` is not installed; launching may still work.
            logger.warning(
                "Could not kill Chrome processes: %s. Proceeding anyway.",
                exc,
            )
        else:
            # Give terminated processes a moment to release the profile.
            await asyncio.sleep(2)

        cmd = [
            chrome_bin,
            f"--remote-debugging-port={self._cdp_port}",
            f"--user-data-dir={self._profile_dir}",
            "--profile-directory=Default",
            "--no-first-run",
            "--no-default-browser-check",
            "--disable-background-timer-throttling",
            "--disable-renderer-backgrounding",
        ]
        if self._headless:
            cmd.append("--headless=new")

        env = os.environ.copy()
        # Fall back to display ":0" when the caller's environment has none.
        env.setdefault("DISPLAY", ":0")

        headless_msg = "headless" if self._headless else "visible"
        logger.info(
            "Auto-launching Chrome (%s, profile: %s).\n"
            " NOTE: This uses the debug profile. Log into target "
            "sites in this profile before scraping.",
            headless_msg,
            self._profile_dir,
        )
        logger.debug("Chrome command: %s", " ".join(cmd))
        try:
            # Detached session: Chrome keeps running independently of
            # this Python process.
            subprocess.Popen(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                start_new_session=True,
                env=env,
            )
        except OSError as exc:
            # Surface spawn failures as the backend's documented error
            # type instead of a raw OSError.
            raise LocalBrowserConnectionError(
                f"Failed to launch Chrome ({chrome_bin}): {exc}"
            ) from exc

    async def _wait_for_chrome_port(self) -> bool:
        """Poll Chrome's debug endpoint until it responds or timeout.

        Probes ``/json/version`` roughly once per
        ``_CDP_POLL_INTERVAL_SEC`` seconds. Polling stops once
        ``_CDP_MAX_WAIT_SEC`` seconds have elapsed or the attempt budget
        is used up, whichever comes first. At least one probe is always
        made, and each probe runs in a worker thread so the event loop
        is not blocked.

        Returns:
            ``True`` if Chrome responded within the timeout.
        """
        loop = asyncio.get_running_loop()
        started = loop.time()
        deadline = started + _CDP_MAX_WAIT_SEC
        max_attempts = max(1, int(_CDP_MAX_WAIT_SEC / _CDP_POLL_INTERVAL_SEC))

        for attempt in range(1, max_attempts + 1):
            if await loop.run_in_executor(None, self._is_chrome_port_open):
                logger.debug(
                    "Chrome port reachable after ~%.0fs",
                    loop.time() - started,
                )
                return True

            logger.debug(
                "Port not ready yet — attempt %d/%d",
                attempt,
                max_attempts,
            )
            remaining = deadline - loop.time()
            # No point sleeping after the final attempt or past the
            # overall time budget (slow probes eat into it).
            if attempt == max_attempts or remaining <= 0:
                break
            await asyncio.sleep(min(_CDP_POLL_INTERVAL_SEC, remaining))

        return False