summaryrefslogtreecommitdiff
path: root/misc/py-aider-chat/files/patch-aider_scrape.py
diff options
context:
space:
mode:
Diffstat (limited to 'misc/py-aider-chat/files/patch-aider_scrape.py')
-rw-r--r--misc/py-aider-chat/files/patch-aider_scrape.py79
1 files changed, 11 insertions, 68 deletions
diff --git a/misc/py-aider-chat/files/patch-aider_scrape.py b/misc/py-aider-chat/files/patch-aider_scrape.py
index 969ff29d15f8..ba103c6f725e 100644
--- a/misc/py-aider-chat/files/patch-aider_scrape.py
+++ b/misc/py-aider-chat/files/patch-aider_scrape.py
@@ -1,65 +1,6 @@
---- aider/scrape.py.orig 2024-09-09 10:28:04 UTC
+--- aider/scrape.py.orig 2025-05-09 22:41:18 UTC
+++ aider/scrape.py
-@@ -15,57 +15,8 @@ def install_playwright(io):
-
-
- def install_playwright(io):
-- try:
-- from playwright.sync_api import sync_playwright
-+ return False
-
-- has_pip = True
-- except ImportError:
-- has_pip = False
--
-- try:
-- with sync_playwright() as p:
-- p.chromium.launch()
-- has_chromium = True
-- except Exception:
-- has_chromium = False
--
-- if has_pip and has_chromium:
-- return True
--
-- pip_cmd = utils.get_pip_install(["aider-chat[playwright]"])
-- chromium_cmd = "-m playwright install --with-deps chromium"
-- chromium_cmd = [sys.executable] + chromium_cmd.split()
--
-- cmds = ""
-- if not has_pip:
-- cmds += " ".join(pip_cmd) + "\n"
-- if not has_chromium:
-- cmds += " ".join(chromium_cmd) + "\n"
--
-- text = f"""For the best web scraping, install Playwright:
--
--{cmds}
--See {urls.enable_playwright} for more info.
--"""
--
-- io.tool_output(text)
-- if not io.confirm_ask("Install playwright?", default="y"):
-- return
--
-- if not has_pip:
-- success, output = utils.run_install(pip_cmd)
-- if not success:
-- io.tool_error(output)
-- return
--
-- success, output = utils.run_install(chromium_cmd)
-- if not success:
-- io.tool_error(output)
-- return
--
-- return True
--
--
- class Scraper:
- pandoc_available = None
- playwright_available = None
-@@ -82,7 +33,7 @@ class Scraper:
+@@ -92,7 +92,7 @@ class Scraper:
else:
self.print_error = print
@@ -68,7 +9,7 @@
self.verify_ssl = verify_ssl
def scrape(self, url):
-@@ -93,10 +44,7 @@ class Scraper:
+@@ -103,10 +103,7 @@ class Scraper:
`url` - the URL to scrape.
"""
@@ -80,10 +21,12 @@
if not content:
self.print_error(f"Failed to retrieve content from {url}")
-@@ -130,56 +78,6 @@ class Scraper:
+@@ -138,58 +135,6 @@ class Scraper:
+ ]
+ return any(re.search(pattern, content, re.IGNORECASE) for pattern in html_patterns)
return False
-
- # Internals...
+-
+- # Internals...
- def scrape_with_playwright(self, url):
- import playwright # noqa: F401
- from playwright.sync_api import Error as PlaywrightError
@@ -113,7 +56,8 @@
- try:
- response = page.goto(url, wait_until="networkidle", timeout=5000)
- except PlaywrightTimeoutError:
-- self.print_error(f"Timeout while loading {url}")
+- print(f"Page didn't quiesce, scraping content anyway: {url}")
+- response = None
- except PlaywrightError as e:
- self.print_error(f"Error navigating to {url}: {str(e)}")
- return None, None
@@ -133,7 +77,6 @@
- browser.close()
-
- return content, mime_type
--
+
def scrape_with_httpx(self, url):
import httpx
-