diff options
Diffstat (limited to 'lib/plugins/link/html.ex')
-rw-r--r-- | lib/plugins/link/html.ex | 168 |
1 file changed, 98 insertions, 70 deletions
defmodule Nola.Plugins.Link.HTML do
  # NOTE(review): this hunk starts mid-module — the file head (@moduledoc,
  # @behaviour declaration, aliases) lives above the diff context and is not
  # shown here; confirm against the full file before applying.

  @impl true
  def match(_, _), do: false

  @impl true
  def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
  def post_match(_, _, _, _), do: false

  # Expands a fetched HTML page into IRC-friendly text. Prefers OpenGraph
  # title/description (with paywall/section/date decoration); falls back to
  # the <title> tag. Returns {:ok, text} where text is a string or a list of
  # IRC-sized lines.
  @impl true
  def post_expand(url, body, _params, _opts) do
    # NOTE(review): Floki.parse/1 is deprecated in recent Floki versions in
    # favour of parse_document/1 — confirm the pinned Floki version.
    html = Floki.parse(body)
    opengraph = collect_open_graph(html)

    text =
      if has_sufficient_opengraph_data?(opengraph) do
        generate_text_from_opengraph(url, html, opengraph)
      else
        clean_text(collect_title(html))
      end

    {:ok, text}
  end

  defp has_sufficient_opengraph_data?(opengraph) do
    Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
  end

  # Builds ["[paywall] site: section — title"] ++ wrapped date+description lines.
  defp generate_text_from_opengraph(url, html, opengraph) do
    itemprops = collect_itemprops(html)
    prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
    description = collect_description(opengraph, itemprops, 500)

    [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
  end

  # First <title> text node, trimmed; "" when absent or not a simple text node.
  defp collect_title(html) do
    case Floki.find(html, "title") do
      [{"title", [], [title]} | _] -> String.trim(title)
      _ -> ""
    end
  end

  # Collects og:* (prefix stripped) and article:* (prefix kept) <meta> tags
  # into a map of name => content.
  defp collect_open_graph(html) do
    html
    |> Floki.find("head meta")
    |> Enum.reduce(%{}, &extract_meta_tag/2)
  end

  defp extract_meta_tag({"meta", values, []}, acc) do
    with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
         true <- valid_meta_tag?(name) do
      Map.put(acc, strip_prefix(name), content)
    else
      _ -> acc
    end
  end

  defp extract_meta_tag(_, acc), do: acc

  # BUG FIX: the nil clause must come FIRST. Previously the catch-all clause
  # was declared first, so it also matched nil (making the nil clause
  # unreachable) and String.starts_with?(nil, _) raised FunctionClauseError
  # for any <meta> without a "property" attribute — List.keyfind/4's default
  # {nil, nil} still matches the {_, name} pattern in the `with` above.
  # Renamed from is_valid_meta_tag? — the is_ prefix is reserved for guards.
  defp valid_meta_tag?(nil), do: false

  defp valid_meta_tag?(name) do
    String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
  end

  defp strip_prefix("og:" <> key), do: key
  defp strip_prefix(other), do: other

  # Collects article:* microdata (itemprop) meta tags as a fallback source.
  defp collect_itemprops(html) do
    html
    |> Floki.find("[itemprop]")
    |> Enum.reduce(%{}, &extract_itemprop/2)
  end

  defp extract_itemprop({"meta", values, []}, acc) do
    # BUG FIX: the is_binary/1 guard mirrors valid_meta_tag?/1 — keyfind's
    # {nil, nil} default matches {_, name}, and a raise inside a `with`
    # clause is NOT routed to `else`, so String.starts_with?(nil, _) crashed.
    with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
         true <- is_binary(name) and String.starts_with?(name, "article:"),
         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}) do
      Map.put(acc, name, content)
    else
      _ -> acc
    end
  end

  defp extract_itemprop(_, acc), do: acc

  # Returns "[paywall] site: section — " (any part may be missing) or ""
  # when there is nothing to show. Site name falls back to the URL host.
  defp collect_prefix_and_site_name(url, opengraph, itemprops) do
    uri = URI.parse(url)
    site_name = Map.get(opengraph, "site_name", uri.host)
    paywall_status = get_paywall_status(opengraph, itemprops)
    section = get_section(opengraph, itemprops)

    prefix = "#{paywall_status}#{site_name}#{section}"
    if prefix == "", do: "", else: "#{prefix} — "
  end

  # "[paywall] " unless article:content_tier (OpenGraph, then itemprops)
  # is "free" or absent.
  defp get_paywall_status(opengraph, itemprops) do
    content_tier =
      Map.get(opengraph, "article:content_tier", Map.get(itemprops, "article:content_tier", "free"))

    if content_tier == "free", do: "", else: "[paywall] "
  end

  defp get_section(opengraph, itemprops) do
    section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
    if section, do: ": #{section}", else: ""
  end

  # Date-prefixed description, truncated to max_length graphemes and split
  # into IRC-sized lines.
  defp collect_description(opengraph, itemprops, max_length) do
    date = get_formatted_date(opengraph, itemprops)
    description = transform_description(Map.get(opengraph, "description"), max_length)

    Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
  end

  # "dd/mm/yy. " from article:published_time (OpenGraph, then itemprops);
  # "" when absent or not valid ISO 8601.
  defp get_formatted_date(opengraph, itemprops) do
    published_time =
      Map.get(opengraph, "article:published_time", Map.get(itemprops, "article:published_time", ""))

    case DateTime.from_iso8601(published_time) do
      {:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
      _ -> ""
    end
  end

  # TODO: Swap with AI description instead of truncating.
  defp transform_description(nil, _max_length), do: nil

  defp transform_description(string, max_length) when is_binary(string) do
    if String.length(string) > max_length do
      # BUG FIX: String.truncate/2 does not exist in Elixir's standard
      # library (it is a Rails/ActiveSupport function), so long descriptions
      # raised UndefinedFunctionError. Slice to the limit and mark the cut.
      String.slice(string, 0, max_length) <> "…"
    else
      string
    end
  end

  # Collapses newlines and decodes HTML entities for single-line IRC output.
  defp clean_text(text) do
    text
    |> String.replace("\n", " ")
    |> HtmlEntities.decode()
  end
end