summaryrefslogtreecommitdiff
path: root/lib/plugins/link/scraper.ex
blob: f5487e3a7d12fb93a17654ad04ce531bcc8a7120 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
defmodule Nola.Plugins.Link.Scraper do
  @moduledoc """
  Dispatches link scraping to a configured external scraping service.

  Configuration is read at runtime from the `:nola, Nola.Plugins.Link`
  application environment, under the `:scraper` key:

      config :nola, Nola.Plugins.Link,
        scraper: [service: "usescraper", config: [api_key: "..."]]

  When no service is configured, `get/1` returns `{:error, :scraping_disabled}`.
  """

  defmodule UseScraper do
    @moduledoc false
    # Client for the UseScraper HTTP API.
    require Logger

    @doc """
    Scrapes `url` through the UseScraper API.

    `config` is a keyword list supporting `:base_url`, `:api_key` and
    `:http_options` (passed through to `HTTPoison.post/4`).

    Returns `{:ok, html, meta}` on success, where `meta` is the service's
    metadata map, or `{:error, reason}` where `reason` is an HTTP reason
    atom (e.g. `:not_found`), `:scrape_failed`, or an `HTTPoison` transport
    error reason.
    """
    def get(url, config) do
      base_url = Keyword.get(config, :base_url, "https://api.usescraper.com")
      api_key = Keyword.get(config, :api_key, "unset api key")
      options = Keyword.get(config, :http_options, [])
      headers = [{"user-agent", "nola, href@random.sh"},
                {"content-type", "application/json"},
                {"authorization", "Bearer " <> api_key}]
      Logger.debug("scraper: use_scraper: get: #{url}")
      # Happy path: POST the scrape request, expect an HTTP 200 from the
      # service AND a "scraped" payload whose fetched-URL status is also 200.
      with {:ok, json} <- Poison.encode(%{"url" => url, "format" => "html"}),
          {:ok, %HTTPoison.Response{status_code: 200, body: body}} <- HTTPoison.post("#{base_url}/scraper/scrape", json, headers, options),
          {:ok, %{"status" => "scraped", "html" => html, "meta" => meta = %{"fetchedUrlStatusCode" => 200}}} <- Poison.decode(body) do
        {:ok, html, meta}
      else
        # Scrape succeeded but the target URL itself returned a non-200 code.
        # NOTE(review): assumes the API ships a "text" key in this payload
        # (we request "html" format) — confirm against the service's docs.
        {:ok, %{"status" => "scraped", "text" => _body, "meta" => %{"fetchedUrlStatusCode" => code}}} ->
          Logger.error("scraper: use_scraper: scraper got http #{code} for #{url}")
          # reason_atom/1 raises ArgumentError for unrecognized status codes.
          status = Plug.Conn.Status.reason_atom(code)
          {:error, status}
        {:ok, %{"status" => "failed"}} ->
          Logger.error("scraper: use_scraper: scraper service failed for #{url}")
          {:error, :scrape_failed}
        # The scraping service itself answered with a non-200 status.
        {:ok, %HTTPoison.Response{status_code: code, body: _body}} ->
          Logger.error("scraper: use_scraper: scraper service failed (http #{code}) for #{url}")
          status = Plug.Conn.Status.reason_atom(code)
          {:error, status}
        # Transport-level failure (DNS, timeout, refused connection, ...).
        {:error, %HTTPoison.Error{reason: reason}} ->
          Logger.error("scraper: use_scraper: scraper service failed (http #{inspect reason}) for #{url}")
          {:error, reason}
      end
    end
  end

  @doc """
  Scrapes `url` using the service configured under the `:scraper` key of the
  `:nola, Nola.Plugins.Link` application environment.

  Returns `{:ok, html, meta}`, `{:error, reason}`, or
  `{:error, :scraping_disabled}` when no known service is configured.
  """
  def get(url) do
    config = Keyword.get(Application.get_env(:nola, Nola.Plugins.Link, []), :scraper) || []
    case config[:service] do
      "usescraper" -> UseScraper.get(url, config[:config] || [])
      _ -> {:error, :scraping_disabled}
    end
  end

end