defmodule Nola.Plugins.Link.Scraper do
  @moduledoc """
  Fetches the HTML of a page through an external scraping service.

  The service is picked at runtime from the `:scraper` key of the
  `Nola.Plugins.Link` application environment; see `get/1`.
  """

  defmodule UseScraper do
    @moduledoc "Client for the UseScraper scraping API."

    require Logger

    def get(url, config) do
      base_url = Keyword.get(config, :base_url, "https://api.usescraper.com")
      api_key = Keyword.get(config, :api_key, "unset api key")
      options = Keyword.get(config, :http_options, [])

      headers = [
        {"user-agent", "nola, href@random.sh"},
        {"content-type", "application/json"},
        {"authorization", "Bearer " <> api_key}
      ]
Logger.debug("scraper: use_scraper: get: #{url}")
with {:ok, json} <- Poison.encode(%{"url" => url, "format" => "html"}),
{:ok, %HTTPoison.Response{status_code: 200, body: body}} <-
HTTPoison.post("#{base_url}/scraper/scrape", json, headers, options),
{:ok,
%{
"status" => "scraped",
"html" => body,
"meta" => meta = %{"fetchedUrlStatusCode" => 200}
}} <- Poison.decode(body) do
{:ok, body, meta}
else
{:ok,
%{
"status" => "scraped",
"text" => body,
"meta" => meta = %{"fetchedUrlStatusCode" => code}
}} ->
Logger.error("scraper: use_scraper: scraper got http #{code} for #{url}")
status = Plug.Conn.Status.reason_atom(code)
{:error, status}
{:ok, %{"status" => "failed"}} ->
Logger.error("scraper: use_scraper: scraper service failed for #{url}")
{:error, :scrape_failed}
{:ok, %HTTPoison.Response{status_code: code, body: body}} ->
Logger.error("scraper: use_scraper: scraper service failed (http #{code}) for #{url}")
status = Plug.Conn.Status.reason_atom(code)
{:error, status}
{:error, %HTTPoison.Error{reason: reason}} ->
Logger.error(
"scraper: use_scraper: scraper service failed (http #{inspect(reason)}) for #{url}"
)
{:error, reason}
end
end
end
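
  # Response shapes handled by `UseScraper.get/2` above, as inferred from the
  # patterns it matches (a sketch, not the service's documented schema):
  #
  #   %{"status" => "scraped", "html" => "...", "meta" => %{"fetchedUrlStatusCode" => 200}}
  #   %{"status" => "scraped", "meta" => %{"fetchedUrlStatusCode" => 404}}
  #   %{"status" => "failed"}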

  @doc """
  Scrapes `url` with the service configured under the `:scraper` key of the
  `Nola.Plugins.Link` application environment.

  Returns `{:ok, html, meta}` on success, `{:error, reason}` on failure, and
  `{:error, :scraping_disabled}` when no service is configured.
  """
  def get(url) do
    config = Keyword.get(Application.get_env(:nola, Nola.Plugins.Link, []), :scraper) || []

    case config[:service] do
      "usescraper" -> UseScraper.get(url, config[:config] || [])
      _ -> {:error, :scraping_disabled}
    end
  end
end
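
# A minimal configuration sketch, derived from the keys `get/1` and
# `UseScraper.get/2` read above (`:service`, `:config`, `:base_url`,
# `:api_key`, `:http_options`); the concrete values are illustrative
# assumptions:
#
#     config :nola, Nola.Plugins.Link,
#       scraper: [
#         service: "usescraper",
#         config: [
#           api_key: System.get_env("USESCRAPER_API_KEY"),
#           http_options: [recv_timeout: 15_000]
#         ]
#       ]
#
# Typical call site (`handle_html/2` and `handle_error/1` are hypothetical):
#
#     case Nola.Plugins.Link.Scraper.get(url) do
#       {:ok, html, meta} -> handle_html(html, meta)
#       {:error, reason} -> handle_error(reason)
#     end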