misc/py-aider-chat/files/patch-aider_scrape.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

--- aider/scrape.py.orig	2025-05-09 22:41:18 UTC
+++ aider/scrape.py
@@ -92,7 +92,7 @@ class Scraper:
         else:
             self.print_error = print
 
-        self.playwright_available = playwright_available
+        self.playwright_available = False
         self.verify_ssl = verify_ssl
 
     def scrape(self, url):
@@ -103,10 +103,7 @@ class Scraper:
         `url` - the URL to scrape.
         """
 
-        if self.playwright_available:
-            content, mime_type = self.scrape_with_playwright(url)
-        else:
-            content, mime_type = self.scrape_with_httpx(url)
+        content, mime_type = self.scrape_with_httpx(url)
 
         if not content:
             self.print_error(f"Failed to retrieve content from {url}")
@@ -138,58 +135,6 @@ class Scraper:
             ]
             return any(re.search(pattern, content, re.IGNORECASE) for pattern in html_patterns)
         return False
-
-    # Internals...
-    def scrape_with_playwright(self, url):
-        import playwright  # noqa: F401
-        from playwright.sync_api import Error as PlaywrightError
-        from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
-        from playwright.sync_api import sync_playwright
-
-        with sync_playwright() as p:
-            try:
-                browser = p.chromium.launch()
-            except Exception as e:
-                self.playwright_available = False
-                self.print_error(str(e))
-                return None, None
-
-            try:
-                context = browser.new_context(ignore_https_errors=not self.verify_ssl)
-                page = context.new_page()
-
-                user_agent = page.evaluate("navigator.userAgent")
-                user_agent = user_agent.replace("Headless", "")
-                user_agent = user_agent.replace("headless", "")
-                user_agent += " " + aider_user_agent
-
-                page.set_extra_http_headers({"User-Agent": user_agent})
-
-                response = None
-                try:
-                    response = page.goto(url, wait_until="networkidle", timeout=5000)
-                except PlaywrightTimeoutError:
-                    print(f"Page didn't quiesce, scraping content anyway: {url}")
-                    response = None
-                except PlaywrightError as e:
-                    self.print_error(f"Error navigating to {url}: {str(e)}")
-                    return None, None
-
-                try:
-                    content = page.content()
-                    mime_type = None
-                    if response:
-                        content_type = response.header_value("content-type")
-                        if content_type:
-                            mime_type = content_type.split(";")[0]
-                except PlaywrightError as e:
-                    self.print_error(f"Error retrieving page content: {str(e)}")
-                    content = None
-                    mime_type = None
-            finally:
-                browser.close()
-
-        return content, mime_type
 
     def scrape_with_httpx(self, url):
         import httpx