Diffstat (limited to 'lib')
 lib/open_ai.ex              |  21
 lib/plugins/link.ex         |  50
 lib/plugins/link/github.ex  |  87
 lib/plugins/link/html.ex    | 168
 lib/plugins/link/reddit.ex  |   2
 lib/plugins/link/scraper.ex |  45
 lib/plugins/link/store.ex   |  30
 7 files changed, 289 insertions(+), 114 deletions(-)
diff --git a/lib/open_ai.ex b/lib/open_ai.ex
index cc0de27..da54e3a 100644
--- a/lib/open_ai.ex
+++ b/lib/open_ai.ex
@@ -1,19 +1,32 @@
defmodule OpenAi do
+ require Logger
+
def post(path, data, options \\ []) do
config = Application.get_env(:nola, :openai, [])
- url = "https://api.openai.com#{path}"
+ base_url = Keyword.get(config, :base_url, "https://api.openai.com")
+ url = "#{base_url}#{path}"
headers = [{"user-agent", "internal private experiment bot, href@random.sh"},
{"content-type", "application/json"},
{"authorization", "Bearer " <> Keyword.get(config, :key, "unset-api-key")}]
- options = options ++ [timeout: :timer.seconds(180), recv_timeout: :timer.seconds(180)]
+ options = options ++ [timeout: :timer.seconds(30), recv_timeout: :timer.seconds(30)]
+ Logger.debug("openai: post: #{url} #{inspect data}")
with {:ok, json} <- Poison.encode(data),
{:ok, %HTTPoison.Response{status_code: 200, body: body}} <- HTTPoison.post(url, json, headers, options),
{:ok, data} <- Poison.decode(body) do
{:ok, data}
else
- {:ok, %HTTPoison.Response{status_code: code}} -> {:error, Plug.Conn.Status.reason_atom(code)}
- {:error, %HTTPoison.Error{reason: reason}} -> {:error, reason}
+ {:ok, %HTTPoison.Response{status_code: code, body: body}} ->
+ Logger.error("OpenAI: HTTP #{code} #{inspect body}")
+ status = Plug.Conn.Status.reason_atom(code)
+ case Poison.decode(body) do
+ {:ok, %{"error" => %{"message" => message, "code" => code}}} ->
+ {:error, {status, message}}
+ kek ->
+ {:error, status}
+ end
+ {:error, %HTTPoison.Error{reason: reason}} ->
+ {:error, reason}
end
end
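
Note: OpenAi reads its settings from the :nola / :openai application env. A minimal sketch of the matching config entry, with key names taken from the Keyword.get calls above; the file location and env-var name are assumptions, not shown in this diff:

    # e.g. in config/runtime.exs (location assumed)
    config :nola, :openai,
      base_url: "https://api.openai.com",    # optional, defaults to the public API
      key: System.get_env("OPENAI_API_KEY")  # sent as the Bearer token
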
diff --git a/lib/plugins/link.ex b/lib/plugins/link.ex
index 4c4261f..84eb976 100644
--- a/lib/plugins/link.ex
+++ b/lib/plugins/link.ex
@@ -37,6 +37,8 @@ defmodule Nola.Plugins.Link do
def short_irc_doc, do: false
def irc_doc, do: @ircdoc
require Logger
+ alias __MODULE__.Store
+ alias __MODULE__.Scraper
def start_link() do
GenServer.start_link(__MODULE__, [], name: __MODULE__)
@@ -52,6 +54,7 @@ defmodule Nola.Plugins.Link do
defstruct [:client]
def init([]) do
+ Store.setup()
{:ok, _} = Registry.register(Nola.PubSub, "messages", [plugin: __MODULE__])
#{:ok, _} = Registry.register(Nola.PubSub, "messages:telegram", [plugin: __MODULE__])
Logger.info("Link handler started")
@@ -71,16 +74,19 @@ defmodule Nola.Plugins.Link do
text = case uris do
[uri] -> text
[luri | _] ->
- if luri.host == uri.host && luri.path == luri.path do
+ if luri.host == uri.host && luri.path == uri.path do
text
else
["-> #{URI.to_string(luri)}", text]
end
end
- if is_list(text) do
- for line <- text, do: message.replyfun.(line)
- else
- message.replyfun.(text)
+ case text do
+ lines when is_list(lines) ->
+ for text <- lines, do: message.replyfun.(text)
+ text when is_binary(text) ->
+ message.replyfun.(text)
+ nil ->
+ nil
end
_ -> nil
end
@@ -239,6 +245,7 @@ defmodule Nola.Plugins.Link do
Logger.debug("link: expanding #{uri} with default")
headers = [{"user-agent", "DmzBot (like TwitterBot)"}]
options = [follow_redirect: false, max_body_length: 30_000_000]
+ url = URI.to_string(uri)
case get(URI.to_string(uri), headers, options) do
{:ok, text} ->
{:ok, acc, text}
@@ -247,12 +254,15 @@ defmodule Nola.Plugins.Link do
#new_uri = %URI{new_uri | scheme: scheme, authority: uri.authority, host: uri.host, port: uri.port}
expand_link([new_uri | acc])
{:error, status, _headers} ->
- text = Plug.Conn.Status.reason_phrase(status)
- {:ok, acc, "Error: HTTP #{text} (#{status})"}
+ #text = Plug.Conn.Status.reason_phrase(status)
+ #{:ok, acc, "Error: HTTP #{text} (#{status})"}
+ retry_expand_with_scraper(acc, url)
{:error, {:tls_alert, {:handshake_failure, err}}} ->
- {:ok, acc, "TLS Error: #{to_string(err)}"}
+ {:ok, acc, nil} # "TLS Error: #{to_string(err)}"}
+ {:error, :timeout} ->
+ retry_expand_with_scraper(acc, url)
{:error, reason} ->
- {:ok, acc, "Error: #{to_string(reason)}"}
+ {:ok, acc, nil} #"Error: #{to_string(reason)}"}
end
end
@@ -261,6 +271,27 @@ defmodule Nola.Plugins.Link do
{:ok, [uri], "-> #{URI.to_string(uri)}"}
end
+ # Last resort: scrape the page.
+ # This is mostly reached on 403/500 responses or timeouts, i.e. when the site blocks us.
+ # An external service scrapes the page for us and returns the body,
+ # which we feed directly to the HTML handler.
+ defp retry_expand_with_scraper(acc, url) do
+ Logger.info("Attempting scraper")
+ handlers = Keyword.get(Application.get_env(:nola, __MODULE__), :handlers)
+ Logger.info("Attempting scraper #{inspect handlers}")
+ with true <- Keyword.has_key?(handlers, :"Nola.Plugins.Link.HTML"),
+ {:ok, body, _meta} <- Scraper.get(url),
+ {:ok, text} <- __MODULE__.HTML.post_expand(url, body, nil, nil)
+ do
+ {:ok, acc, text}
+ else
+ error ->
+ Logger.debug("Attempt with scraper failed: #{inspect error}")
+ # Give up here: return no text (the caller's acc no longer matters),
+ # since relaying an error message to the channel is not useful.
+ {:ok, acc, nil}
+ end
+ end
defp human_size(bytes) do
bytes
@@ -268,4 +299,5 @@ defmodule Nola.Plugins.Link do
|> FileSize.scale()
|> FileSize.format()
end
+
end
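
Note: retry_expand_with_scraper/2 only runs when the HTML handler is enabled in the plugin's :handlers config. The exact shape of that config is not shown in this diff; a plausible sketch, inferred from the Keyword.has_key?/2 check above (handler list keyed by module-name atoms, options as values):

    # assumed shape — not confirmed by this diff
    config :nola, Nola.Plugins.Link,
      handlers: [
        "Nola.Plugins.Link.HTML": [],
        "Nola.Plugins.Link.Github": [],
        "Nola.Plugins.Link.Reddit": []
      ]
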
diff --git a/lib/plugins/link/github.ex b/lib/plugins/link/github.ex
index 0069a40..77fa81f 100644
--- a/lib/plugins/link/github.ex
+++ b/lib/plugins/link/github.ex
@@ -3,11 +3,10 @@ defmodule Nola.Plugins.Link.Github do
@impl true
def match(uri = %URI{host: "github.com", path: path}, _) do
- case String.split(path, "/") do
- ["", user, repo] ->
- {true, %{user: user, repo: repo, path: "#{user}/#{repo}"}}
- _ ->
- false
+ with ["", user, repo] <- String.split(path, "/") do
+ {true, %{user: user, repo: repo, path: "#{user}/#{repo}"}}
+ else
+ _ -> false
end
end
@@ -18,32 +17,60 @@ defmodule Nola.Plugins.Link.Github do
@impl true
def expand(_uri, %{user: user, repo: repo}, _opts) do
- case HTTPoison.get("https://api.github.com/repos/#{user}/#{repo}") do
- {:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
- {:ok, json} = Jason.decode(body)
- src = json["source"]["full_name"]
- disabled = if(json["disabled"], do: " (disabled)", else: "")
- archived = if(json["archived"], do: " (archived)", else: "")
- fork = if src && src != json["full_name"] do
- " (⑂ #{json["source"]["full_name"]})"
- else
- ""
- end
- start = "#{json["full_name"]}#{disabled}#{archived}#{fork} - #{json["description"]}"
- tags = for(t <- json["topics"]||[], do: "##{t}") |> Enum.intersperse(", ") |> Enum.join("")
- lang = if(json["language"], do: "#{json["language"]} - ", else: "")
- issues = if(json["open_issues_count"], do: "#{json["open_issues_count"]} issues - ", else: "")
- last_push = if at = json["pushed_at"] do
- {:ok, date, _} = DateTime.from_iso8601(at)
- " - last pushed #{DateTime.to_string(date)}"
- else
- ""
- end
- network = "#{lang}#{issues}#{json["stargazers_count"]} stars - #{json["subscribers_count"]} watchers - #{json["forks_count"]} forks#{last_push}"
- {:ok, [start, tags, network]}
- other ->
- :error
+ with {:ok, response} <- HTTPoison.get("https://api.github.com/repos/#{user}/#{repo}"),
+ {:ok, json} <- Jason.decode(response.body) do
+ info = %{
+ full_name: json["full_name"],
+ disabled: json["disabled"],
+ archived: json["archived"],
+ source: json["source"],
+ description: json["description"],
+ topics: json["topics"],
+ language: json["language"],
+ open_issues_count: json["open_issues_count"],
+ pushed_at: json["pushed_at"],
+ stargazers_count: json["stargazers_count"],
+ subscribers_count: json["subscribers_count"],
+ forks_count: json["forks_count"]
+ }
+
+ start = build_start(info)
+ tags = build_tags(info)
+ network = build_network(info)
+
+ {:ok, [start, tags, network]}
+ else
+ _ -> :error
end
end
+ defp build_start(info) do
+ parts = []
+ |> maybe_add(info.disabled, " (disabled)")
+ |> maybe_add(info.archived, " (archived)")
+ |> maybe_add(info.source && info.source["full_name"] != info.full_name, " (⑂ #{info.source["full_name"]})")
+
+ "#{info.full_name}#{parts} - #{info.description}"
+ end
+
+ defp build_tags(info) do
+ for(t <- info.topics || [], do: "##{t}") |> Enum.intersperse(", ") |> Enum.join("")
+ end
+
+ defp build_network(info) do
+ lang = info.language && "#{info.language} - " || ""
+ issues = info.open_issues_count && "#{info.open_issues_count} issues - " || ""
+ last_push =
+ if at = info.pushed_at do
+ {:ok, date, _} = DateTime.from_iso8601(at)
+ " - last pushed #{DateTime.to_string(date)}"
+ else
+ ""
+ end
+ "#{lang}#{issues}#{info.stargazers_count} stars - #{info.subscribers_count} watchers - #{info.forks_count} forks#{last_push}"
+ end
+
+ defp maybe_add(acc, condition, value) do
+ if condition, do: acc ++ [value], else: acc
+ end
end
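
Design note: build_start/1 interpolates the accumulated parts list straight into the result string; a list of binaries is valid chardata, so the suffixes concatenate in order. An illustrative sketch of the intermediate values (maybe_add/3 is private, so this is for intent only; the repo name is made up):

    parts = [] |> maybe_add(true, " (archived)") |> maybe_add(false, " (disabled)")
    # parts == [" (archived)"]
    "my/repo#{parts} - demo"
    # => "my/repo (archived) - demo"
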
diff --git a/lib/plugins/link/html.ex b/lib/plugins/link/html.ex
index a941aac..5899ed5 100644
--- a/lib/plugins/link/html.ex
+++ b/lib/plugins/link/html.ex
@@ -5,102 +5,130 @@ defmodule Nola.Plugins.Link.HTML do
def match(_, _), do: false
@impl true
- def post_match(_url, "text/html"<>_, _header, _opts) do
- {:body, nil}
- end
+ def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
def post_match(_, _, _, _), do: false
@impl true
def post_expand(url, body, _params, _opts) do
html = Floki.parse(body)
- title = collect_title(html)
opengraph = collect_open_graph(html)
- itemprops = collect_itemprops(html)
- text = if Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description") do
- sitename = if sn = Map.get(opengraph, "site_name") do
- "#{sn}"
- else
- ""
- end
- paywall? = if Map.get(opengraph, "article:content_tier", Map.get(itemprops, "article:content_tier", "free")) == "free" do
- ""
- else
- "[paywall] "
- end
- section = if section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section", nil)) do
- ": #{section}"
- else
- ""
- end
- date = case DateTime.from_iso8601(Map.get(opengraph, "article:published_time", Map.get(itemprops, "article:published_time", ""))) do
- {:ok, date, _} ->
- "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
- _ ->
- ""
- end
- uri = URI.parse(url)
-
- prefix = "#{paywall?}#{Map.get(opengraph, "site_name", uri.host)}#{section}"
- prefix = unless prefix == "" do
- "#{prefix} — "
- else
- ""
- end
- [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ Nola.Irc.Message.splitlong(clean_text("#{date}#{Map.get(opengraph, "description")}"))
+
+ text = if has_sufficient_opengraph_data?(opengraph) do
+ generate_text_from_opengraph(url, html, opengraph)
else
- clean_text(title)
+ clean_text(collect_title(html))
end
+
{:ok, text}
end
+ defp has_sufficient_opengraph_data?(opengraph) do
+ Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
+ end
+
+ defp generate_text_from_opengraph(url, html, opengraph) do
+ itemprops = collect_itemprops(html)
+ prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
+ description = collect_description(opengraph, itemprops, 500)
+
+ [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
+ end
+
defp collect_title(html) do
case Floki.find(html, "title") do
- [{"title", [], [title]} | _] ->
- String.trim(title)
- _ ->
- nil
+ [{"title", [], [title]} | _] -> String.trim(title)
+ _ -> ""
end
end
defp collect_open_graph(html) do
- Enum.reduce(Floki.find(html, "head meta"), %{}, fn(tag, acc) ->
- case tag do
- {"meta", values, []} ->
- name = List.keyfind(values, "property", 0, {nil, nil}) |> elem(1)
- content = List.keyfind(values, "content", 0, {nil, nil}) |> elem(1)
- case name do
- "og:" <> key ->
- Map.put(acc, key, content)
- "article:"<>_ ->
- Map.put(acc, name, content)
- _other -> acc
- end
- _other -> acc
- end
- end)
+ Floki.find(html, "head meta")
+ |> Enum.reduce(%{}, &extract_meta_tag/2)
end
+ defp extract_meta_tag({"meta", values, []}, acc) do
+ with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
+ {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
+ true <- is_valid_meta_tag?(name) do
+ Map.put(acc, strip_prefix(name), content)
+ else
+ _ -> acc
+ end
+ end
+ defp extract_meta_tag(_, acc), do: acc
+
+ # the nil clause must come first: a meta tag without a "property" attribute
+ # yields name = nil, which would crash String.starts_with?/2
+ defp is_valid_meta_tag?(nil), do: false
+
+ defp is_valid_meta_tag?(name) do
+ String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
+ end
+
+ defp strip_prefix("og:" <> key), do: key
+ defp strip_prefix(other), do: other
+
defp collect_itemprops(html) do
- Enum.reduce(Floki.find(html, "[itemprop]"), %{}, fn(tag, acc) ->
- case tag do
- {"meta", values, []} ->
- name = List.keyfind(values, "itemprop", 0, {nil, nil}) |> elem(1)
- content = List.keyfind(values, "content", 0, {nil, nil}) |> elem(1)
- case name do
- "article:" <> key ->
- Map.put(acc, name, content)
- _other -> acc
- end
- _other -> acc
- end
- end)
+ Floki.find(html, "[itemprop]")
+ |> Enum.reduce(%{}, &extract_itemprop/2)
end
+ defp extract_itemprop({"meta", values, []}, acc) do
+ with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
+ {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
+ true <- String.starts_with?(name, "article:") do
+ Map.put(acc, name, content)
+ else
+ _ -> acc
+ end
+ end
+ defp extract_itemprop(_, acc), do: acc
+
+ defp collect_prefix_and_site_name(url, opengraph, itemprops) do
+ uri = URI.parse(url)
+ site_name = Map.get(opengraph, "site_name", uri.host)
+ paywall_status = get_paywall_status(opengraph, itemprops)
+ section = get_section(opengraph, itemprops)
+
+ prefix = "#{paywall_status}#{site_name}#{section}"
+ if prefix == "", do: "", else: "#{prefix} — "
+ end
+
+ defp get_paywall_status(opengraph, itemprops) do
+ content_tier = Map.get(opengraph, "article:content_tier", Map.get(itemprops, "article:content_tier", "free"))
+ if content_tier == "free", do: "", else: "[paywall] "
+ end
+
+ defp get_section(opengraph, itemprops) do
+ section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
+ if section, do: ": #{section}", else: ""
+ end
+
+ defp collect_description(opengraph, itemprops, max_length) do
+ date = get_formatted_date(opengraph, itemprops)
+ description = transform_description(Map.get(opengraph, "description"), max_length)
+
+ Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
+ end
+
+ defp get_formatted_date(opengraph, itemprops) do
+ published_time = Map.get(opengraph, "article:published_time", Map.get(itemprops, "article:published_time", ""))
+ case DateTime.from_iso8601(published_time) do
+ {:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
+ _ -> ""
+ end
+ end
+
+ # TODO: Swap with AI description instead of truncating.
+ defp transform_description(string, length) when is_binary(string) do
+ if String.length(string) >= length, do: String.slice(string, 0, length), else: string
+ end
+ defp transform_description(nil, _), do: nil
+
defp clean_text(text) do
text
|> String.replace("\n", " ")
|> HtmlEntities.decode()
end
-
end
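
Note: with the helpers above, Open Graph keys are stored with the og: prefix stripped, article:* keys keep their full name, and every other meta property is ignored. A small illustration of the resulting map (private functions, shown for intent only):

    # <meta property="og:title" content="Hello">          => %{"title" => "Hello"}
    # <meta property="og:description" content="World">    => %{"description" => "World"}
    # <meta property="article:section" content="Tech">    => %{"article:section" => "Tech"}
    # <meta name="viewport" content="width=device-width"> => ignored (no og:/article: property)
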
diff --git a/lib/plugins/link/reddit.ex b/lib/plugins/link/reddit.ex
index 016e025..707e284 100644
--- a/lib/plugins/link/reddit.ex
+++ b/lib/plugins/link/reddit.ex
@@ -108,7 +108,7 @@ defmodule Nola.Plugins.Link.Reddit do
end
title = "#{nsfw}#{sr}: #{flair}#{Map.get(op, "title")}"
state_str = if(state, do: "#{state}, ")
- content = "by u/#{Map.get(op, "author")} - #{state_str}#{up} up, #{down} down, #{comments} comments - #{self_str}"
+ content = "by u/#{Map.get(op, "author")} - #{state_str}#{up} up, #{comments} comments - #{self_str}"
{:ok, [title, content]}
err ->
diff --git a/lib/plugins/link/scraper.ex b/lib/plugins/link/scraper.ex
new file mode 100644
index 0000000..f5487e3
--- /dev/null
+++ b/lib/plugins/link/scraper.ex
@@ -0,0 +1,45 @@
+defmodule Nola.Plugins.Link.Scraper do
+
+ defmodule UseScraper do
+ require Logger
+
+ def get(url, config) do
+ base_url = Keyword.get(config, :base_url, "https://api.usescraper.com")
+ api_key = Keyword.get(config, :api_key, "unset api key")
+ options = Keyword.get(config, :http_options, [])
+ headers = [{"user-agent", "nola, href@random.sh"},
+ {"content-type", "application/json"},
+ {"authorization", "Bearer " <> api_key}]
+ Logger.debug("scraper: use_scraper: get: #{url}")
+ with {:ok, json} <- Poison.encode(%{"url" => url, "format" => "html"}),
+ {:ok, %HTTPoison.Response{status_code: 200, body: body}} <- HTTPoison.post("#{base_url}/scraper/scrape", json, headers, options),
+ {:ok, %{"status" => "scraped", "html" => body, "meta" => meta = %{"fetchedUrlStatusCode" => 200}}} <- Poison.decode(body) do
+ {:ok, body, meta}
+ else
+ {:ok, %{"status" => "scraped", "text" => body, "meta" => meta = %{"fetchedUrlStatusCode" => code}}} ->
+ Logger.error("scraper: use_scraper: scraper got http #{code} for #{url}")
+ status = Plug.Conn.Status.reason_atom(code)
+ {:error, status}
+ {:ok, %{"status" => "failed"}} ->
+ Logger.error("scraper: use_scraper: scraper service failed for #{url}")
+ {:error, :scrape_failed}
+ {:ok, %HTTPoison.Response{status_code: code, body: body}} ->
+ Logger.error("scraper: use_scraper: scraper service failed (http #{code}) for #{url}")
+ status = Plug.Conn.Status.reason_atom(code)
+ {:error, status}
+ {:error, %HTTPoison.Error{reason: reason}} ->
+ Logger.error("scraper: use_scraper: scraper service failed (http #{inspect reason}) for #{url}")
+ {:error, reason}
+ end
+ end
+ end
+
+ def get(url) do
+ config = Keyword.get(Application.get_env(:nola, Nola.Plugins.Link, []), :scraper) || []
+ case config[:service] do
+ "usescraper" -> UseScraper.get(url, config[:config] || [])
+ _ -> {:error, :scraping_disabled}
+ end
+ end
+
+end
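
Note: scraping stays disabled unless a :scraper entry is configured for the Link plugin. A minimal sketch of the expected shape, inferred from the lookups above (the env-var name is an assumption):

    config :nola, Nola.Plugins.Link,
      scraper: [
        service: "usescraper",
        config: [
          api_key: System.get_env("USESCRAPER_API_KEY"),   # assumed env var
          base_url: "https://api.usescraper.com",          # optional override
          http_options: [recv_timeout: :timer.seconds(30)] # passed to HTTPoison
        ]
      ]
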
diff --git a/lib/plugins/link/store.ex b/lib/plugins/link/store.ex
new file mode 100644
index 0000000..566cc9a
--- /dev/null
+++ b/lib/plugins/link/store.ex
@@ -0,0 +1,30 @@
+defmodule Nola.Plugins.Link.Store do
+ require Record
+ import Ex2ms
+
+ @type url() :: String.t()
+
+ Record.defrecord(:link, link: nil, at: nil)
+ @type link :: record(:link, link: String.t(), at: integer() | nil)
+
+ Record.defrecord(:link_entry, key: nil, at: nil)
+ @type link_entry :: record(:link_entry, key: {url(), String.t()}, at: integer() | nil)
+
+ def setup do
+ :ets.new(:links, [:set, :public, :named_table, keypos: 2])
+ end
+
+ @spec insert_link(url()) :: true
+ def insert_link(url) do
+ # NaiveDateTime has no to_unix/1; use DateTime for a unix timestamp
+ :ets.insert(:links, link(link: url, at: DateTime.utc_now() |> DateTime.to_unix()))
+ end
+
+ @spec get_link(url()) :: link() | nil
+ def get_link(url) do
+ case :ets.lookup(:links, url) do
+ [link] -> link
+ [] -> nil
+ end
+ end
+
+end
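
Note: the store is a plain named ETS table, so setup/0 must run once (the Link plugin calls it from init/1) before the other functions are used. A quick usage sketch (timestamp value illustrative):

    Nola.Plugins.Link.Store.setup()
    Nola.Plugins.Link.Store.insert_link("https://example.org/article")
    Nola.Plugins.Link.Store.get_link("https://example.org/article")
    #=> {:link, "https://example.org/article", 1735689600}  # link() record, or nil if absent
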