diff options
Diffstat (limited to 'misc/py-aider-chat/files/patch-aider_scrape.py')
-rw-r--r-- | misc/py-aider-chat/files/patch-aider_scrape.py | 79 |
1 files changed, 11 insertions, 68 deletions
diff --git a/misc/py-aider-chat/files/patch-aider_scrape.py b/misc/py-aider-chat/files/patch-aider_scrape.py index 969ff29d15f8..ba103c6f725e 100644 --- a/misc/py-aider-chat/files/patch-aider_scrape.py +++ b/misc/py-aider-chat/files/patch-aider_scrape.py @@ -1,65 +1,6 @@ ---- aider/scrape.py.orig 2024-09-09 10:28:04 UTC +--- aider/scrape.py.orig 2025-05-09 22:41:18 UTC +++ aider/scrape.py -@@ -15,57 +15,8 @@ def install_playwright(io): - - - def install_playwright(io): -- try: -- from playwright.sync_api import sync_playwright -+ return False - -- has_pip = True -- except ImportError: -- has_pip = False -- -- try: -- with sync_playwright() as p: -- p.chromium.launch() -- has_chromium = True -- except Exception: -- has_chromium = False -- -- if has_pip and has_chromium: -- return True -- -- pip_cmd = utils.get_pip_install(["aider-chat[playwright]"]) -- chromium_cmd = "-m playwright install --with-deps chromium" -- chromium_cmd = [sys.executable] + chromium_cmd.split() -- -- cmds = "" -- if not has_pip: -- cmds += " ".join(pip_cmd) + "\n" -- if not has_chromium: -- cmds += " ".join(chromium_cmd) + "\n" -- -- text = f"""For the best web scraping, install Playwright: -- --{cmds} --See {urls.enable_playwright} for more info. --""" -- -- io.tool_output(text) -- if not io.confirm_ask("Install playwright?", default="y"): -- return -- -- if not has_pip: -- success, output = utils.run_install(pip_cmd) -- if not success: -- io.tool_error(output) -- return -- -- success, output = utils.run_install(chromium_cmd) -- if not success: -- io.tool_error(output) -- return -- -- return True -- -- - class Scraper: - pandoc_available = None - playwright_available = None -@@ -82,7 +33,7 @@ class Scraper: +@@ -92,7 +92,7 @@ class Scraper: else: self.print_error = print @@ -68,7 +9,7 @@ self.verify_ssl = verify_ssl def scrape(self, url): -@@ -93,10 +44,7 @@ class Scraper: +@@ -103,10 +103,7 @@ class Scraper: `url` - the URL to scrape. """ @@ -80,10 +21,12 @@ if not content: self.print_error(f"Failed to retrieve content from {url}") -@@ -130,56 +78,6 @@ class Scraper: +@@ -138,58 +135,6 @@ class Scraper: + ] + return any(re.search(pattern, content, re.IGNORECASE) for pattern in html_patterns) return False - - # Internals... +- +- # Internals... - def scrape_with_playwright(self, url): - import playwright # noqa: F401 - from playwright.sync_api import Error as PlaywrightError @@ -113,7 +56,8 @@ - try: - response = page.goto(url, wait_until="networkidle", timeout=5000) - except PlaywrightTimeoutError: -- self.print_error(f"Timeout while loading {url}") +- print(f"Page didn't quiesce, scraping content anyway: {url}") +- response = None - except PlaywrightError as e: - self.print_error(f"Error navigating to {url}: {str(e)}") - return None, None @@ -133,7 +77,6 @@ - browser.close() - - return content, mime_type -- + def scrape_with_httpx(self, url): import httpx - |