misc/py-aider-chat/files/patch-aider_scrape.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139

--- aider/scrape.py.orig	2024-09-09 10:28:04 UTC
+++ aider/scrape.py
@@ -15,57 +15,8 @@ def install_playwright(io):
 
 
 def install_playwright(io):
-    try:
-        from playwright.sync_api import sync_playwright
+    return False
 
-        has_pip = True
-    except ImportError:
-        has_pip = False
-
-    try:
-        with sync_playwright() as p:
-            p.chromium.launch()
-            has_chromium = True
-    except Exception:
-        has_chromium = False
-
-    if has_pip and has_chromium:
-        return True
-
-    pip_cmd = utils.get_pip_install(["aider-chat[playwright]"])
-    chromium_cmd = "-m playwright install --with-deps chromium"
-    chromium_cmd = [sys.executable] + chromium_cmd.split()
-
-    cmds = ""
-    if not has_pip:
-        cmds += " ".join(pip_cmd) + "\n"
-    if not has_chromium:
-        cmds += " ".join(chromium_cmd) + "\n"
-
-    text = f"""For the best web scraping, install Playwright:
-
-{cmds}
-See {urls.enable_playwright} for more info.
-"""
-
-    io.tool_output(text)
-    if not io.confirm_ask("Install playwright?", default="y"):
-        return
-
-    if not has_pip:
-        success, output = utils.run_install(pip_cmd)
-        if not success:
-            io.tool_error(output)
-            return
-
-    success, output = utils.run_install(chromium_cmd)
-    if not success:
-        io.tool_error(output)
-        return
-
-    return True
-
-
 class Scraper:
     pandoc_available = None
     playwright_available = None
@@ -82,7 +33,7 @@ class Scraper:
         else:
             self.print_error = print
 
-        self.playwright_available = playwright_available
+        self.playwright_available = False
         self.verify_ssl = verify_ssl
 
     def scrape(self, url):
@@ -93,10 +44,7 @@ class Scraper:
         `url` - the URL to scrape.
         """
 
-        if self.playwright_available:
-            content, mime_type = self.scrape_with_playwright(url)
-        else:
-            content, mime_type = self.scrape_with_httpx(url)
+        content, mime_type = self.scrape_with_httpx(url)
 
         if not content:
             self.print_error(f"Failed to retrieve content from {url}")
@@ -130,56 +78,6 @@ class Scraper:
         return False
 
     # Internals...
-    def scrape_with_playwright(self, url):
-        import playwright  # noqa: F401
-        from playwright.sync_api import Error as PlaywrightError
-        from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
-        from playwright.sync_api import sync_playwright
-
-        with sync_playwright() as p:
-            try:
-                browser = p.chromium.launch()
-            except Exception as e:
-                self.playwright_available = False
-                self.print_error(str(e))
-                return None, None
-
-            try:
-                context = browser.new_context(ignore_https_errors=not self.verify_ssl)
-                page = context.new_page()
-
-                user_agent = page.evaluate("navigator.userAgent")
-                user_agent = user_agent.replace("Headless", "")
-                user_agent = user_agent.replace("headless", "")
-                user_agent += " " + aider_user_agent
-
-                page.set_extra_http_headers({"User-Agent": user_agent})
-
-                response = None
-                try:
-                    response = page.goto(url, wait_until="networkidle", timeout=5000)
-                except PlaywrightTimeoutError:
-                    self.print_error(f"Timeout while loading {url}")
-                except PlaywrightError as e:
-                    self.print_error(f"Error navigating to {url}: {str(e)}")
-                    return None, None
-
-                try:
-                    content = page.content()
-                    mime_type = None
-                    if response:
-                        content_type = response.header_value("content-type")
-                        if content_type:
-                            mime_type = content_type.split(";")[0]
-                except PlaywrightError as e:
-                    self.print_error(f"Error retrieving page content: {str(e)}")
-                    content = None
-                    mime_type = None
-            finally:
-                browser.close()
-
-        return content, mime_type
-
     def scrape_with_httpx(self, url):
         import httpx