summaryrefslogtreecommitdiff
path: root/lib/plugins/link/html.ex
blob: 1173526ea85dda4f05dcfb97d941c3a0d96770bf (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
defmodule Nola.Plugins.Link.HTML do
  @moduledoc """
  Link-preview handler for `text/html` responses.

  Produces a short IRC-friendly summary of a web page. When the page exposes
  both an OpenGraph title and description, the summary is built from OpenGraph
  (and `itemprop` meta) data — site name, optional `[paywall]` marker, section,
  publication date and a truncated description. Otherwise it falls back to the
  page's `<title>` element.
  """

  @behaviour Nola.Plugins.Link

  # Maximum description length before truncation (in graphemes).
  @max_description_length 400

  # This handler never claims a URL up front; it is selected after the HTTP
  # response based on the Content-Type header (see post_match/4).
  @impl true
  def match(_, _), do: false

  @impl true
  def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
  def post_match(_, _, _, _), do: false

  @doc """
  Expands a fetched HTML `body` into preview text.

  Returns `{:ok, text}` where `text` is either a list of IRC-sized lines
  (OpenGraph path) or a single cleaned string (`<title>` fallback).
  Raises on unparseable documents (asserts `Floki.parse_document/1` success).
  """
  @impl true
  def post_expand(url, body, _params, _opts) do
    {:ok, html} = Floki.parse_document(body)
    opengraph = collect_open_graph(html)

    text =
      if sufficient_opengraph_data?(opengraph) do
        generate_text_from_opengraph(url, html, opengraph)
      else
        clean_text(collect_title(html))
      end

    {:ok, text}
  end

  # A preview can be built from OpenGraph only when both title and
  # description are present.
  defp sufficient_opengraph_data?(opengraph) do
    Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
  end

  # Builds ["prefix — title", date/description lines...] from OpenGraph data.
  defp generate_text_from_opengraph(url, html, opengraph) do
    itemprops = collect_itemprops(html)
    prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
    description = collect_description(opengraph, itemprops, @max_description_length)

    [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
  end

  # Extracts the text of the first <title> element, or "" when absent.
  # Matches any attributes/children so titles like <title data-x="y">a<b>c</b></title>
  # are not silently dropped.
  defp collect_title(html) do
    case Floki.find(html, "title") do
      [{"title", _attrs, children} | _] -> children |> Floki.text() |> String.trim()
      _ -> ""
    end
  end

  # Collects og:* and article:* meta tags into a map, with the "og:" prefix
  # stripped. The tag list is reversed so that, on duplicates, the FIRST
  # occurrence in document order wins (later reduce puts overwrite earlier).
  defp collect_open_graph(html) do
    Floki.find(html, "head meta")
    |> Enum.reverse()
    |> Enum.reduce(%{}, &extract_meta_tag/2)
  end

  defp extract_meta_tag({"meta", values, []}, acc) do
    with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
         true <- valid_meta_tag?(name) do
      Map.put(acc, strip_prefix(name), content)
    else
      _ -> acc
    end
  end

  defp extract_meta_tag(_, acc), do: acc

  # Only og:* and article:* properties are interesting for previews.
  # Note: no `is_` prefix — that convention is reserved for guard macros.
  defp valid_meta_tag?(nil), do: false

  defp valid_meta_tag?(name) do
    String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
  end

  defp strip_prefix("og:" <> key), do: key
  defp strip_prefix(other), do: other

  # Collects article:* itemprop meta tags (used by some sites instead of
  # OpenGraph properties) into a map keyed by the full itemprop name.
  defp collect_itemprops(html) do
    Floki.find(html, "[itemprop]")
    |> Enum.reduce(%{}, &extract_itemprop/2)
  end

  defp extract_itemprop({"meta", values, []}, acc) do
    with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
         true <- String.starts_with?(name, "article:") do
      Map.put(acc, name, content)
    else
      _ -> acc
    end
  end

  defp extract_itemprop(_, acc), do: acc

  # Builds the "[paywall] SiteName: Section — " prefix. Falls back to the
  # URL host when og:site_name is missing; returns "" when nothing is known.
  defp collect_prefix_and_site_name(url, opengraph, itemprops) do
    uri = URI.parse(url)
    site_name = Map.get(opengraph, "site_name", uri.host)
    paywall_status = get_paywall_status(opengraph, itemprops)
    section = get_section(opengraph, itemprops)

    prefix = "#{paywall_status}#{site_name}#{section}"
    if prefix == "", do: "", else: "#{prefix} — "
  end

  # "[paywall] " marker unless article:content_tier is "free" (the default
  # when the tag is absent from both OpenGraph and itemprop data).
  defp get_paywall_status(opengraph, itemprops) do
    content_tier =
      Map.get(
        opengraph,
        "article:content_tier",
        Map.get(itemprops, "article:content_tier", "free")
      )

    if content_tier == "free", do: "", else: "[paywall] "
  end

  defp get_section(opengraph, itemprops) do
    section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
    if section, do: ": #{section}", else: ""
  end

  # Returns the "date. description" text split into IRC-sized message lines.
  defp collect_description(opengraph, itemprops, max_length) do
    date = get_formatted_date(opengraph, itemprops)
    description = transform_description(Map.get(opengraph, "description"), max_length)

    Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
  end

  # Formats article:published_time (ISO 8601) as "dd/mm/yy. ", or "" when
  # missing or unparseable.
  defp get_formatted_date(opengraph, itemprops) do
    published_time =
      Map.get(
        opengraph,
        "article:published_time",
        Map.get(itemprops, "article:published_time", "")
      )

    case DateTime.from_iso8601(published_time) do
      {:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
      _ -> ""
    end
  end

  # TODO: Swap with AI description instead of truncating.
  defp transform_description(nil, _), do: nil

  # Truncates to at most `length` graphemes plus an ellipsis. Uses
  # String.slice/3 (start, count) — the previous 0..length range was
  # inclusive and kept length + 1 characters.
  defp transform_description(string, length) when is_binary(string) do
    if String.length(string) > length, do: "#{String.slice(string, 0, length)}…", else: string
  end

  # Flattens the text to a single line and decodes HTML entities.
  # Literal "<br>" variants may appear in meta content; treat them as spaces.
  defp clean_text(text) do
    text
    |> String.replace("\n", " ")
    |> String.replace("<br>", " ")
    |> String.replace("<br/>", " ")
    |> String.replace("<br />", " ")
    |> HtmlEntities.decode()
  end
end