diff options
author | Dave Cottlehuber <dch@FreeBSD.org> | 2024-07-27 19:20:20 +0000 |
---|---|---|
committer | Dave Cottlehuber <dch@FreeBSD.org> | 2024-07-27 19:26:02 +0000 |
commit | 800bb7c50cfe116a92e6668875bec471cd4cb4dd (patch) | |
tree | 3359a5d2d8d2fe85a9d8deafa15eece213f20e2a /misc/py-aider-chat/files/patch-aider_scrape.py | |
parent | security/aws-iam-authenticator: Update to 0.6.22 (diff) |
misc/py-aider-chat: update to 0.45.0
Reported by: portscout
Sponsored by: SkunkWerks, GmbH
Diffstat (limited to 'misc/py-aider-chat/files/patch-aider_scrape.py')
-rw-r--r-- | misc/py-aider-chat/files/patch-aider_scrape.py | 120 |
1 files changed, 56 insertions, 64 deletions
diff --git a/misc/py-aider-chat/files/patch-aider_scrape.py b/misc/py-aider-chat/files/patch-aider_scrape.py index 6ba15a5ce575..f590cbbf2c49 100644 --- a/misc/py-aider-chat/files/patch-aider_scrape.py +++ b/misc/py-aider-chat/files/patch-aider_scrape.py @@ -1,80 +1,72 @@ ---- aider/scrape.py.orig 2024-07-04 14:23:32 UTC +--- aider/scrape.py.orig 2024-07-27 19:09:04 UTC +++ aider/scrape.py -@@ -3,9 +3,7 @@ import sys - import re - import sys +@@ -15,56 +15,9 @@ def install_playwright(io): --import playwright - import pypandoc --from playwright.sync_api import sync_playwright - from aider import __version__, urls - from aider.dump import dump # noqa: F401 -@@ -42,14 +40,10 @@ class Scraper: - """ - Scrape a url and turn it into readable markdown. + def install_playwright(io): +- try: +- from playwright.sync_api import sync_playwright ++ return -- `url` - the URLto scrape. -+ `url` - the URL to scrape. - """ -- self.try_playwright() +- has_pip = True +- except ImportError: +- has_pip = False -- if self.playwright_available: -- content = self.scrape_with_playwright(url) -- else: -- content = self.scrape_with_httpx(url) -+ content = self.scrape_with_httpx(url) - - if not content: - return -@@ -62,50 +56,8 @@ class Scraper: - - # Internals... - def scrape_with_playwright(self, url): +- try: - with sync_playwright() as p: -- try: -- browser = p.chromium.launch() -- except Exception as e: -- self.playwright_available = False -- self.print_error(e) -- return +- p.chromium.launch() +- has_chromium = True +- except Exception: +- has_chromium = False - -- page = browser.new_page() +- if has_pip and has_chromium: +- return True - -- user_agent = page.evaluate("navigator.userAgent") -- user_agent = user_agent.replace("Headless", "") -- user_agent = user_agent.replace("headless", "") -- user_agent += " " + aider_user_agent +- pip_cmd = utils.get_pip_install(["aider-chat[playwright]"]) +- chromium_cmd = "playwright install --with-deps chromium".split() - -- page = browser.new_page(user_agent=user_agent) -- try: -- page.goto(url, wait_until="networkidle", timeout=5000) -- except playwright._impl._errors.TimeoutError: -- pass -- content = page.content() -- browser.close() +- cmds = "" +- if not has_pip: +- cmds += " ".join(pip_cmd) + "\n" +- if not has_chromium: +- cmds += " ".join(chromium_cmd) + "\n" - -- return content +- text = f"""For the best web scraping, install Playwright: - -- def try_playwright(self): -- if self.playwright_available is not None: -- return +-{cmds} +-See {urls.enable_playwright} for more info. +-""" - -- try: -- with sync_playwright() as p: -- p.chromium.launch() -- self.playwright_available = True -- except Exception: - self.playwright_available = False +- io.tool_error(text) +- if not io.confirm_ask("Install playwright?", default="y"): +- return - -- def get_playwright_instructions(self): -- if self.playwright_available in (True, None): - return -- if self.playwright_instructions_shown: +- if not has_pip: +- success, output = utils.run_install(pip_cmd) +- if not success: +- io.tool_error(output) - return - -- self.playwright_instructions_shown = True -- return PLAYWRIGHT_INFO +- success, output = utils.run_install(chromium_cmd) +- if not success: +- io.tool_error(output) +- return +- +- return True +- +- + class Scraper: + pandoc_available = None + playwright_available = None +@@ -89,10 +42,7 @@ class Scraper: + `url` - the URLto scrape. + """ + +- if self.playwright_available: +- content = self.scrape_with_playwright(url) +- else: +- content = self.scrape_with_httpx(url) ++ content = self.scrape_with_httpx(url) - def scrape_with_httpx(self, url): - import httpx + if not content: + return |