 lib/open_ai.ex              |  21
 lib/plugins/link.ex         |  50
 lib/plugins/link/github.ex  |  87
 lib/plugins/link/html.ex    | 168
 lib/plugins/link/reddit.ex  |   2
 lib/plugins/link/scraper.ex |  45
 lib/plugins/link/store.ex   |  30
 7 files changed, 289 insertions(+), 114 deletions(-)
diff --git a/lib/open_ai.ex b/lib/open_ai.ex
index cc0de27..da54e3a 100644
--- a/lib/open_ai.ex
+++ b/lib/open_ai.ex
@@ -1,19 +1,32 @@
 defmodule OpenAi do
+  require Logger
+
   def post(path, data, options \\ []) do
     config = Application.get_env(:nola, :openai, [])
-    url = "https://api.openai.com#{path}"
+    base_url = Keyword.get(config, :base_url, "https://api.openai.com")
+    url = "#{base_url}#{path}"
     headers = [{"user-agent", "internal private experiment bot, href@random.sh"},
                {"content-type", "application/json"},
                {"authorization", "Bearer " <> Keyword.get(config, :key, "unset-api-key")}]
-    options = options ++ [timeout: :timer.seconds(180), recv_timeout: :timer.seconds(180)]
+    options = options ++ [timeout: :timer.seconds(30), recv_timeout: :timer.seconds(30)]
+    Logger.debug("openai: post: #{url} #{inspect data}")
     with {:ok, json} <- Poison.encode(data),
          {:ok, %HTTPoison.Response{status_code: 200, body: body}} <- HTTPoison.post(url, json, headers, options),
          {:ok, data} <- Poison.decode(body) do
       {:ok, data}
     else
-      {:ok, %HTTPoison.Response{status_code: code}} -> {:error, Plug.Conn.Status.reason_atom(code)}
-      {:error, %HTTPoison.Error{reason: reason}} -> {:error, reason}
+      {:ok, %HTTPoison.Response{status_code: code, body: body}} ->
+        Logger.error("OpenAI: HTTP #{code} #{inspect body}")
+        status = Plug.Conn.Status.reason_atom(code)
+        case Poison.decode(body) do
+          {:ok, %{"error" => %{"message" => message, "code" => _code}}} ->
+            {:error, {status, message}}
+          _ ->
+            {:error, status}
+        end
+      {:error, %HTTPoison.Error{reason: reason}} ->
+        {:error, reason}
     end
   end
diff --git a/lib/plugins/link.ex b/lib/plugins/link.ex
index 4c4261f..84eb976 100644
--- a/lib/plugins/link.ex
+++ b/lib/plugins/link.ex
@@ -37,6 +37,8 @@ defmodule Nola.Plugins.Link do
   def short_irc_doc, do: false
   def irc_doc, do: @ircdoc
   require Logger
+  alias __MODULE__.Store
+  alias __MODULE__.Scraper
 
   def start_link() do
     GenServer.start_link(__MODULE__, [], name: __MODULE__)
@@ -52,6 +54,7 @@ defmodule Nola.Plugins.Link do
   defstruct [:client]
 
   def init([]) do
+    Store.setup()
    {:ok, _} = Registry.register(Nola.PubSub, "messages", [plugin: __MODULE__])
    #{:ok, _} = Registry.register(Nola.PubSub, "messages:telegram", [plugin: __MODULE__])
    Logger.info("Link handler started")
@@ -71,16 +74,19 @@ defmodule Nola.Plugins.Link do
        text = case uris do
          [uri] -> text
          [luri | _] ->
-            if luri.host == uri.host && luri.path == luri.path do
+            if luri.host == uri.host && luri.path == uri.path do
              text
            else
              ["-> #{URI.to_string(luri)}", text]
            end
        end
-        if is_list(text) do
-          for line <- text, do: message.replyfun.(line)
-        else
-          message.replyfun.(text)
+        case text do
+          lines when is_list(lines) ->
+            for text <- lines, do: message.replyfun.(text)
+          text when is_binary(text) ->
+            message.replyfun.(text)
+          nil ->
+            nil
        end
      _ -> nil
    end
@@ -239,6 +245,7 @@ defmodule Nola.Plugins.Link do
    Logger.debug("link: expanding #{uri} with default")
    headers = [{"user-agent", "DmzBot (like TwitterBot)"}]
    options = [follow_redirect: false, max_body_length: 30_000_000]
+    url = URI.to_string(uri)
    case get(URI.to_string(uri), headers, options) do
      {:ok, text} ->
        {:ok, acc, text}
@@ -247,12 +254,15 @@ defmodule Nola.Plugins.Link do
      {:redirect, new_uri} ->
        #new_uri = %URI{new_uri | scheme: scheme, authority: uri.authority, host: uri.host, port: uri.port}
        expand_link([new_uri | acc])
      {:error, status, _headers} ->
-        text = Plug.Conn.Status.reason_phrase(status)
-        {:ok, acc, "Error: HTTP #{text} (#{status})"}
+        #text = Plug.Conn.Status.reason_phrase(status)
+        #{:ok, acc, "Error: HTTP #{text} (#{status})"}
+        retry_expand_with_scraper(acc, url)
      {:error, {:tls_alert, {:handshake_failure, err}}} ->
-        {:ok, acc, "TLS Error: #{to_string(err)}"}
+        {:ok, acc, nil} # "TLS Error: #{to_string(err)}"}
+      {:error, :timeout} ->
+        retry_expand_with_scraper(acc, url)
      {:error, reason} ->
-        {:ok, acc, "Error: #{to_string(reason)}"}
+        {:ok, acc, nil} #"Error: #{to_string(reason)}"}
    end
  end
@@ -261,6 +271,27 @@ defmodule Nola.Plugins.Link do
    {:ok, [uri], "-> #{URI.to_string(uri)}"}
  end
 
+  # Last resort: scrape the page.
+  # We'll mostly be calling this on 403, 500 or timeouts, when the site blocks us.
+  # An external service scrapes the page for us and returns the body,
+  # and we call the HTML handler directly on the result.
+  defp retry_expand_with_scraper(acc, url) do
+    Logger.info("Attempting scraper")
+    handlers = Keyword.get(Application.get_env(:nola, __MODULE__, []), :handlers, [])
+    Logger.info("Attempting scraper #{inspect handlers}")
+    with true <- Keyword.has_key?(handlers, :"Nola.Plugins.Link.HTML"),
+         {:ok, body, _meta} <- Scraper.get(url),
+         {:ok, text} <- __MODULE__.HTML.post_expand(url, body, nil, nil)
+    do
+      {:ok, acc, text}
+    else
+      error ->
+        Logger.debug("Attempt with scraper failed: #{inspect error}")
+        # We give up here. We don't return anything (the acc from the caller `expand_default`
+        # does not matter anymore), and returning error messages is not useful.
+        {:ok, acc, nil}
+    end
+  end
 
  defp human_size(bytes) do
    bytes
@@ -268,4 +299,5 @@ defmodule Nola.Plugins.Link do
    |> FileSize.scale()
    |> FileSize.format()
  end
+
 end
diff --git a/lib/plugins/link/github.ex b/lib/plugins/link/github.ex
index 0069a40..77fa81f 100644
--- a/lib/plugins/link/github.ex
+++ b/lib/plugins/link/github.ex
@@ -3,11 +3,10 @@ defmodule Nola.Plugins.Link.Github do
   @impl true
   def match(uri = %URI{host: "github.com", path: path}, _) do
-    case String.split(path, "/") do
-      ["", user, repo] ->
-        {true, %{user: user, repo: repo, path: "#{user}/#{repo}"}}
-      _ ->
-        false
+    with ["", user, repo] <- String.split(path, "/") do
+      {true, %{user: user, repo: repo, path: "#{user}/#{repo}"}}
+    else
+      _ -> false
     end
   end
 
@@ -18,32 +17,60 @@ defmodule Nola.Plugins.Link.Github do
   @impl true
   def expand(_uri, %{user: user, repo: repo}, _opts) do
-    case HTTPoison.get("https://api.github.com/repos/#{user}/#{repo}") do
-      {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
-        {:ok, json} = Jason.decode(body)
-        src = json["source"]["full_name"]
-        disabled = if(json["disabled"], do: " (disabled)", else: "")
-        archived = if(json["archived"], do: " (archived)", else: "")
-        fork = if src && src != json["full_name"] do
-          " (⑂ #{json["source"]["full_name"]})"
-        else
-          ""
-        end
-        start = "#{json["full_name"]}#{disabled}#{archived}#{fork} - #{json["description"]}"
-        tags = for(t <- json["topics"]||[], do: "##{t}") |> Enum.intersperse(", ") |> Enum.join("")
-        lang = if(json["language"], do: "#{json["language"]} - ", else: "")
-        issues = if(json["open_issues_count"], do: "#{json["open_issues_count"]} issues - ", else: "")
-        last_push = if at = json["pushed_at"] do
-          {:ok, date, _} = DateTime.from_iso8601(at)
-          " - last pushed #{DateTime.to_string(date)}"
-        else
-          ""
-        end
-        network = "#{lang}#{issues}#{json["stargazers_count"]} stars - #{json["subscribers_count"]} watchers - #{json["forks_count"]} forks#{last_push}"
-        {:ok, [start, tags, network]}
-      other ->
-        :error
+    with {:ok, response} <- HTTPoison.get("https://api.github.com/repos/#{user}/#{repo}"),
+         {:ok, json} <- Jason.decode(response.body) do
+      info = %{
+        full_name: json["full_name"],
+        disabled: json["disabled"],
+        archived: json["archived"],
+        source: json["source"],
+        description: json["description"],
+        topics: json["topics"],
+        language: json["language"],
+        open_issues_count: json["open_issues_count"],
+        pushed_at: json["pushed_at"],
+        stargazers_count: json["stargazers_count"],
+        subscribers_count: json["subscribers_count"],
+        forks_count: json["forks_count"]
+      }
+
+      start = build_start(info)
+      tags = build_tags(info)
+      network = build_network(info)
+
+      {:ok, [start, tags, network]}
+    else
+      _ -> :error
     end
   end
 
+  defp build_start(info) do
+    parts = []
+    |> maybe_add(info.disabled, " (disabled)")
+    |> maybe_add(info.archived, " (archived)")
+    |> maybe_add(info.source && info.source["full_name"] != info.full_name, " (⑂ #{info.source["full_name"]})")
+
+    "#{info.full_name}#{Enum.join(parts)} - #{info.description}"
+  end
+
+  defp build_tags(info) do
+    for(t <- info.topics || [], do: "##{t}") |> Enum.intersperse(", ") |> Enum.join("")
+  end
+
+  defp build_network(info) do
+    lang = info.language && "#{info.language} - " || ""
+    issues = info.open_issues_count && "#{info.open_issues_count} issues - " || ""
+    last_push =
+      if at = info.pushed_at do
+        {:ok, date, _} = DateTime.from_iso8601(at)
+        " - last pushed #{DateTime.to_string(date)}"
+      else
+        ""
+      end
+    "#{lang}#{issues}#{info.stargazers_count} stars - #{info.subscribers_count} watchers - #{info.forks_count} forks#{last_push}"
+  end
+
+  defp maybe_add(acc, condition, value) do
+    if condition, do: acc ++ [value], else: acc
+  end
 end
diff --git a/lib/plugins/link/html.ex b/lib/plugins/link/html.ex
index a941aac..5899ed5 100644
--- a/lib/plugins/link/html.ex
+++ b/lib/plugins/link/html.ex
@@ -5,102 +5,130 @@ defmodule Nola.Plugins.Link.HTML do
   def match(_, _), do: false
 
   @impl true
-  def post_match(_url, "text/html"<>_, _header, _opts) do
-    {:body, nil}
-  end
+  def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
   def post_match(_, _, _, _), do: false
 
   @impl true
   def post_expand(url, body, _params, _opts) do
     html = Floki.parse(body)
-    title = collect_title(html)
     opengraph = collect_open_graph(html)
-    itemprops = collect_itemprops(html)
-    text = if Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description") do
-      sitename = if sn = Map.get(opengraph, "site_name") do
-        "#{sn}"
-      else
-        ""
-      end
-      paywall? = if Map.get(opengraph, "article:content_tier", Map.get(itemprops, "article:content_tier", "free")) == "free" do
-        ""
-      else
-        "[paywall] "
-      end
-      section = if section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section", nil)) do
-        ": #{section}"
-      else
-        ""
-      end
-      date = case DateTime.from_iso8601(Map.get(opengraph, "article:published_time", Map.get(itemprops, "article:published_time", ""))) do
-        {:ok, date, _} ->
-          "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
-        _ ->
-          ""
-      end
-      uri = URI.parse(url)
-
-      prefix = "#{paywall?}#{Map.get(opengraph, "site_name", uri.host)}#{section}"
-      prefix = unless prefix == "" do
-        "#{prefix} — "
-      else
-        ""
-      end
-      [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ Nola.Irc.Message.splitlong(clean_text("#{date}#{Map.get(opengraph, "description")}"))
+
+    text = if has_sufficient_opengraph_data?(opengraph) do
+      generate_text_from_opengraph(url, html, opengraph)
     else
-      clean_text(title)
+      clean_text(collect_title(html))
     end
+
     {:ok, text}
   end
 
+  defp has_sufficient_opengraph_data?(opengraph) do
+    Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
+  end
+
+  defp generate_text_from_opengraph(url, html, opengraph) do
+    itemprops = collect_itemprops(html)
+    prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
+    description = collect_description(opengraph, itemprops, 500)
+
+    [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
+  end
+
   defp collect_title(html) do
     case Floki.find(html, "title") do
-      [{"title", [], [title]} | _] ->
-        String.trim(title)
-      _ ->
-        nil
+      [{"title", [], [title]} | _] -> String.trim(title)
+      _ -> ""
     end
   end
 
   defp collect_open_graph(html) do
-    Enum.reduce(Floki.find(html, "head meta"), %{}, fn(tag, acc) ->
-      case tag do
-        {"meta", values, []} ->
-          name = List.keyfind(values, "property", 0, {nil, nil}) |> elem(1)
-          content = List.keyfind(values, "content", 0, {nil, nil}) |> elem(1)
-          case name do
-            "og:" <> key ->
-              Map.put(acc, key, content)
-            "article:"<>_ ->
-              Map.put(acc, name, content)
-            _other -> acc
-          end
-        _other -> acc
-      end
-    end)
+    Floki.find(html, "head meta")
+    |> Enum.reduce(%{}, &extract_meta_tag/2)
   end
 
+  defp extract_meta_tag({"meta", values, []}, acc) do
+    with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
+         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
+         true <- is_valid_meta_tag?(name) do
+      Map.put(acc, strip_prefix(name), content)
+    else
+      _ -> acc
+    end
+  end
+  defp extract_meta_tag(_, acc), do: acc
+
+  defp is_valid_meta_tag?(nil) do
+    false
+  end
+
+  defp is_valid_meta_tag?(name) do
+    String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
+  end
+
+  defp strip_prefix("og:" <> key), do: key
+  defp strip_prefix(other), do: other
+
   defp collect_itemprops(html) do
-    Enum.reduce(Floki.find(html, "[itemprop]"), %{}, fn(tag, acc) ->
-      case tag do
-        {"meta", values, []} ->
-          name = List.keyfind(values, "itemprop", 0, {nil, nil}) |> elem(1)
-          content = List.keyfind(values, "content", 0, {nil, nil}) |> elem(1)
-          case name do
-            "article:" <> key ->
-              Map.put(acc, name, content)
-            _other -> acc
-          end
-        _other -> acc
-      end
-    end)
+    Floki.find(html, "[itemprop]")
+    |> Enum.reduce(%{}, &extract_itemprop/2)
   end
 
+  defp extract_itemprop({"meta", values, []}, acc) do
+    with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
+         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
+         true <- String.starts_with?(name, "article:") do
+      Map.put(acc, name, content)
+    else
+      _ -> acc
+    end
+  end
+  defp extract_itemprop(_, acc), do: acc
+
+  defp collect_prefix_and_site_name(url, opengraph, itemprops) do
+    uri = URI.parse(url)
+    site_name = Map.get(opengraph, "site_name", uri.host)
+    paywall_status = get_paywall_status(opengraph, itemprops)
+    section = get_section(opengraph, itemprops)
+
+    prefix = "#{paywall_status}#{site_name}#{section}"
+    if prefix == "", do: "", else: "#{prefix} — "
+  end
+
+  defp get_paywall_status(opengraph, itemprops) do
+    content_tier = Map.get(opengraph, "article:content_tier", Map.get(itemprops, "article:content_tier", "free"))
+    if content_tier == "free", do: "", else: "[paywall] "
+  end
+
+  defp get_section(opengraph, itemprops) do
+    section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
+    if section, do: ": #{section}", else: ""
+  end
+
+  defp collect_description(opengraph, itemprops, max_length) do
+    date = get_formatted_date(opengraph, itemprops)
+    description = transform_description(Map.get(opengraph, "description"), max_length)
+
+    Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
+  end
+
+  defp get_formatted_date(opengraph, itemprops) do
+    published_time = Map.get(opengraph, "article:published_time", Map.get(itemprops, "article:published_time", ""))
+    case DateTime.from_iso8601(published_time) do
+      {:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
+      _ -> ""
+    end
+  end
+
+  # TODO: Swap with AI description instead of truncating.
+  defp transform_description(string, length) when is_binary(string) do
+    if String.length(string) >= length, do: String.slice(string, 0, length), else: string
+  end
+  defp transform_description(nil, _), do: nil
+
   defp clean_text(text) do
     text
     |> String.replace("\n", " ")
     |> HtmlEntities.decode()
   end
-
 end
diff --git a/lib/plugins/link/reddit.ex b/lib/plugins/link/reddit.ex
index 016e025..707e284 100644
--- a/lib/plugins/link/reddit.ex
+++ b/lib/plugins/link/reddit.ex
@@ -108,7 +108,7 @@ defmodule Nola.Plugins.Link.Reddit do
         end
         title = "#{nsfw}#{sr}: #{flair}#{Map.get(op, "title")}"
         state_str = if(state, do: "#{state}, ")
-        content = "by u/#{Map.get(op, "author")} - #{state_str}#{up} up, #{down} down, #{comments} comments - #{self_str}"
+        content = "by u/#{Map.get(op, "author")} - #{state_str}#{up} up, #{comments} comments - #{self_str}"
 
         {:ok, [title, content]}
       err ->
diff --git a/lib/plugins/link/scraper.ex b/lib/plugins/link/scraper.ex
new file mode 100644
index 0000000..f5487e3
--- /dev/null
+++ b/lib/plugins/link/scraper.ex
@@ -0,0 +1,45 @@
+defmodule Nola.Plugins.Link.Scraper do
+
+  defmodule UseScraper do
+    require Logger
+
+    def get(url, config) do
+      base_url = Keyword.get(config, :base_url, "https://api.usescraper.com")
+      api_key = Keyword.get(config, :api_key, "unset api key")
+      options = Keyword.get(config, :http_options, [])
+      headers = [{"user-agent", "nola, href@random.sh"},
+                 {"content-type", "application/json"},
+                 {"authorization", "Bearer " <> api_key}]
+      Logger.debug("scraper: use_scraper: get: #{url}")
+      with {:ok, json} <- Poison.encode(%{"url" => url, "format" => "html"}),
+           {:ok, %HTTPoison.Response{status_code: 200, body: body}} <- HTTPoison.post("#{base_url}/scraper/scrape", json, headers, options),
+           {:ok, %{"status" => "scraped", "html" => body, "meta" => meta = %{"fetchedUrlStatusCode" => 200}}} <- Poison.decode(body) do
+        {:ok, body, meta}
+      else
+        {:ok, %{"status" => "scraped", "text" => _body, "meta" => %{"fetchedUrlStatusCode" => code}}} ->
+          Logger.error("scraper: use_scraper: scraper got http #{code} for #{url}")
+          status = Plug.Conn.Status.reason_atom(code)
+          {:error, status}
+        {:ok, %{"status" => "failed"}} ->
+          Logger.error("scraper: use_scraper: scraper service failed for #{url}")
+          {:error, :scrape_failed}
+        {:ok, %HTTPoison.Response{status_code: code, body: _body}} ->
+          Logger.error("scraper: use_scraper: scraper service failed (http #{code}) for #{url}")
+          status = Plug.Conn.Status.reason_atom(code)
+          {:error, status}
+        {:error, %HTTPoison.Error{reason: reason}} ->
+          Logger.error("scraper: use_scraper: scraper service failed (#{inspect reason}) for #{url}")
+          {:error, reason}
+      end
+    end
+  end
+
+  def get(url) do
+    config = Keyword.get(Application.get_env(:nola, Nola.Plugins.Link, []), :scraper) || []
+    case config[:service] do
+      "usescraper" -> UseScraper.get(url, config[:config] || [])
+      _ -> {:error, :scraping_disabled}
+    end
+  end
+
+end
diff --git a/lib/plugins/link/store.ex b/lib/plugins/link/store.ex
new file mode 100644
index 0000000..566cc9a
--- /dev/null
+++ b/lib/plugins/link/store.ex
@@ -0,0 +1,30 @@
+defmodule Nola.Plugins.Link.Store do
+  require Record
+  import Ex2ms
+
+  @type url() :: String.t()
+
+  Record.defrecord(:link, link: nil, at: nil)
+  @type link :: record(:link, link: String.t(), at: integer() | nil)
+
+  Record.defrecord(:link_entry, key: nil, at: nil)
+  @type link_entry :: record(:link_entry, key: {url(), String.t()}, at: integer() | nil)
+
+  def setup do
+    :ets.new(:links, [:set, :public, :named_table, keypos: 2])
+  end
+
+  @spec insert_link(url()) :: true
+  def insert_link(url) do
+    :ets.insert(:links, link(link: url, at: DateTime.utc_now() |> DateTime.to_unix()))
+  end
+
+  @spec get_link(url()) :: link() | nil
+  def get_link(url) do
+    case :ets.lookup(:links, url) do
+      [link] -> link
+      [] -> nil
+    end
+  end
+
+end