1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
defmodule Nola.Plugins.Link.HTML do
  @moduledoc """
  Link-preview handler for HTML pages.

  Implements the `Nola.Plugins.Link` behaviour: any `text/html` response body
  is parsed with Floki and turned into preview text. When the page exposes
  enough OpenGraph metadata (both `og:title` and `og:description`), the preview
  is built from it — optionally prefixed with a `[paywall]` marker, the site
  name, the article section, and the publication date. Otherwise the preview
  falls back to the page's `<title>` element.
  """

  @behaviour Nola.Plugins.Link

  @impl true
  # This handler never matches on the URL alone; it only reacts to the
  # fetched response via post_match/4.
  def match(_, _), do: false

  @impl true
  # Accept any HTML content type ("text/html", "text/html; charset=...", ...)
  # and ask the pipeline to fetch the body for post_expand/4.
  def post_match(_url, "text/html" <> _, _header, _opts), do: {:body, nil}
  def post_match(_, _, _, _), do: false

  @impl true
  @doc false
  # Builds the preview text for a fetched HTML body.
  #
  # Returns `{:ok, text}` where `text` is either a list of IRC-safe lines
  # (OpenGraph path) or a single cleaned string (title fallback).
  # Crashes (let-it-crash) if Floki cannot parse the document.
  def post_expand(url, body, _params, _opts) do
    {:ok, html} = Floki.parse_document(body)
    opengraph = collect_open_graph(html)

    text =
      if has_sufficient_opengraph_data?(opengraph) do
        generate_text_from_opengraph(url, html, opengraph)
      else
        clean_text(collect_title(html))
      end

    {:ok, text}
  end

  # A page qualifies for the OpenGraph preview only when both title and
  # description are present; anything less falls back to <title>.
  defp has_sufficient_opengraph_data?(opengraph) do
    Map.has_key?(opengraph, "title") && Map.has_key?(opengraph, "description")
  end

  # Assembles "[paywall] Site: Section — Title" + wrapped description lines.
  defp generate_text_from_opengraph(url, html, opengraph) do
    itemprops = collect_itemprops(html)
    prefix = collect_prefix_and_site_name(url, opengraph, itemprops)
    description = collect_description(opengraph, itemprops, 400)
    [clean_text("#{prefix}#{Map.get(opengraph, "title")}")] ++ description
  end

  # First <title> element's text, trimmed; "" when absent or oddly shaped.
  defp collect_title(html) do
    case Floki.find(html, "title") do
      [{"title", [], [title]} | _] -> String.trim(title)
      _ -> ""
    end
  end

  # Collects og:*/article:* <meta> tags into a map. The list is reversed
  # before reducing so that, on duplicate properties, the FIRST occurrence
  # in document order wins (later reduce steps overwrite earlier ones).
  defp collect_open_graph(html) do
    Floki.find(html, "head meta")
    |> Enum.reverse()
    |> Enum.reduce(%{}, &extract_meta_tag/2)
  end

  # Folds one <meta property=... content=...> node into the accumulator,
  # keeping only og:/article: properties (og: prefix is stripped).
  defp extract_meta_tag({"meta", values, []}, acc) do
    with {_, name} <- List.keyfind(values, "property", 0, {nil, nil}),
         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
         true <- valid_meta_tag?(name) do
      Map.put(acc, strip_prefix(name), content)
    else
      _ -> acc
    end
  end

  defp extract_meta_tag(_, acc), do: acc

  # Renamed from is_valid_meta_tag?/1: the is_ prefix is reserved for
  # guard-safe macros in Elixir naming conventions.
  defp valid_meta_tag?(nil), do: false

  defp valid_meta_tag?(name) do
    String.starts_with?(name, "og:") || String.starts_with?(name, "article:")
  end

  # og:title -> "title"; article:section stays as-is.
  defp strip_prefix("og:" <> key), do: key
  defp strip_prefix(other), do: other

  # Collects article:* metadata expressed as itemprop microdata
  # (<meta itemprop="article:..." content="...">) into a map.
  defp collect_itemprops(html) do
    Floki.find(html, "[itemprop]")
    |> Enum.reduce(%{}, &extract_itemprop/2)
  end

  defp extract_itemprop({"meta", values, []}, acc) do
    with {_, name} <- List.keyfind(values, "itemprop", 0, {nil, nil}),
         {_, content} <- List.keyfind(values, "content", 0, {nil, nil}),
         true <- String.starts_with?(name, "article:") do
      Map.put(acc, name, content)
    else
      _ -> acc
    end
  end

  defp extract_itemprop(_, acc), do: acc

  # Builds the "[paywall] Site: Section — " prefix. Site name falls back to
  # the URL host; when every component is empty the separator is omitted.
  defp collect_prefix_and_site_name(url, opengraph, itemprops) do
    uri = URI.parse(url)
    site_name = Map.get(opengraph, "site_name", uri.host)
    paywall_status = get_paywall_status(opengraph, itemprops)
    section = get_section(opengraph, itemprops)
    prefix = "#{paywall_status}#{site_name}#{section}"
    if prefix == "", do: "", else: "#{prefix} — "
  end

  # "[paywall] " marker for any non-"free" content tier; OpenGraph wins over
  # itemprop, and an absent tier is treated as free.
  defp get_paywall_status(opengraph, itemprops) do
    content_tier =
      Map.get(
        opengraph,
        "article:content_tier",
        Map.get(itemprops, "article:content_tier", "free")
      )

    if content_tier == "free", do: "", else: "[paywall] "
  end

  # ": Section" suffix when an article section is declared, else "".
  defp get_section(opengraph, itemprops) do
    section = Map.get(opengraph, "article:section", Map.get(itemprops, "article:section"))
    if section, do: ": #{section}", else: ""
  end

  # "DD/MM/YY. Description…" cleaned and split into IRC-length-safe lines.
  defp collect_description(opengraph, itemprops, max_length) do
    date = get_formatted_date(opengraph, itemprops)
    description = transform_description(Map.get(opengraph, "description"), max_length)
    Nola.Irc.Message.splitlong(clean_text("#{date}#{description}"))
  end

  # Formats article:published_time (ISO 8601) as "DD/MM/YY. "; returns ""
  # when the timestamp is absent or unparsable.
  defp get_formatted_date(opengraph, itemprops) do
    published_time =
      Map.get(
        opengraph,
        "article:published_time",
        Map.get(itemprops, "article:published_time", "")
      )

    case DateTime.from_iso8601(published_time) do
      {:ok, date, _} -> "#{Timex.format!(date, "%d/%m/%y", :strftime)}. "
      _ -> ""
    end
  end

  # TODO: Swap with AI description instead of truncating.
  defp transform_description(nil, _), do: nil

  # Truncates to at most `length` graphemes plus an ellipsis.
  # Bug fix: the previous `String.slice(string, 0..length)` used an INCLUSIVE
  # range and therefore kept length + 1 characters; slice/3 keeps exactly
  # `length`.
  defp transform_description(string, length) when is_binary(string) do
    if String.length(string) > length, do: "#{String.slice(string, 0, length)}…", else: string
  end

  # Normalizes extracted text for single-line IRC output: newlines and literal
  # <br> variants become spaces, then HTML entities are decoded.
  defp clean_text(text) do
    text
    |> String.replace("\n", " ")
    |> String.replace("<br>", " ")
    |> String.replace("<br/>", " ")
    |> String.replace("<br />", " ")
    |> HtmlEntities.decode()
  end
end
|