Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions assets/css/screens/keyword.scss
Original file line number Diff line number Diff line change
@@ -1,4 +1,29 @@
body.keyword.show {
nav ul {
list-style: none;
}

// Small uppercase pill used to label result links.
.badge {
  border: {
    width: 1px;
    style: solid;
    radius: 5px;
  }
  font-size: x-small;
  padding: 3px;
  text-transform: uppercase;
}

// Green variant marking ads links.
.badge-ads {
  border-color: green;
  color: green;
}

// Grey variant for the ad's position badge (presumably top/bottom —
// verify against the template that renders it).
.badge-ads-position {
  border-color: grey;
  color: grey;
}

iframe {
width: 100%;
height: 400px;
Expand Down
2 changes: 1 addition & 1 deletion lib/google_crawler/accounts.ex
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ defmodule GoogleCrawler.Accounts do
## Examples

iex> auth_user("bob@email.com", "valid_password")
{:ok, $User{}}
{:ok, %User{}}

iex> auth_user("bob@email.com", "invalid_password")
{:error, "invalid password"}
Expand Down
11 changes: 9 additions & 2 deletions lib/google_crawler/google/api_client.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ end
defmodule GoogleCrawler.Google.ApiClient do
@behaviour GoogleCrawler.Google.ApiClientBehaviour

@url "https://www.google.com/search?q="
@url "https://www.google.com/search?hl=en&q="
@user_agent "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"

def search(keyword) do
case HTTPoison.get(@url <> URI.encode(keyword)) do
case HTTPoison.get(@url <> URI.encode(keyword), request_headers()) do
{:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
{:ok, body}

Expand All @@ -19,4 +20,10 @@ defmodule GoogleCrawler.Google.ApiClient do
{:error, reason}
end
end

# HTTP headers attached to every outgoing search request.
def request_headers do
  [{"User-Agent", @user_agent}]
end
end
74 changes: 74 additions & 0 deletions lib/google_crawler/google/scraper.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
defmodule GoogleCrawler.Google.Scraper do
  @moduledoc """
  Parses a Google search result page into a `GoogleCrawler.Google.ScraperResult`:
  the total result count, the organic (non-ads) links, and the top/bottom
  ads links.
  """

  alias GoogleCrawler.Google.ScraperResult

  # CSS selectors for the fragments we extract. NOTE(review): these are tied
  # to Google's markup at the time of writing and will silently yield empty
  # lists / 0 totals when Google changes it.
  @selectors %{
    total_results: "#result-stats",
    non_ads_links: "div.r > a",
    top_ads_links: "#tads .ads-ad > .ad_cclk > a.V0MxL",
    bottom_ads_links: "#bottomads .ads-ad > .ad_cclk > a.V0MxL"
  }

  @doc """
  Scrapes the given HTML and returns a populated `%ScraperResult{}`.

  Raises `MatchError` when Floki cannot parse the document.
  """
  def scrap(html) do
    {:ok, document} = Floki.parse_document(html)

    %ScraperResult{}
    |> parse_raw_html_result(html)
    |> parse_total_results(document)
    |> parse_non_ads_links(document)
    |> parse_top_ads_links(document)
    |> parse_bottom_ads_links(document)
  end

  # Extracts the "About N results" count. The stats text may be missing or
  # shaped differently (no results, localized page), in which case
  # Regex.named_captures/2 returns nil — the original then crashed with
  # BadMapError (and Integer.parse/1 could return :error, crashing elem/2).
  # Fall back to 0 instead.
  defp parse_total_results(result, document) do
    total_results_text =
      document
      |> Floki.find(@selectors.total_results)
      |> Floki.text()

    captures = Regex.named_captures(~r/About (?<total_result>.*) results/, total_results_text)

    total_results =
      with %{"total_result" => count_text} <- captures,
           {count, _rest} <- Integer.parse(String.replace(count_text, ",", "")) do
        count
      else
        _ -> 0
      end

    %{result | total_results: total_results}
  end

  defp parse_non_ads_links(result, document) do
    non_ads_links = parse_links(document, @selectors.non_ads_links)

    %{result | links: non_ads_links, total_links: length(non_ads_links)}
  end

  defp parse_top_ads_links(result, document) do
    top_ads_links = parse_links(document, @selectors.top_ads_links)

    %{result | top_ads_links: top_ads_links, total_top_ads_links: length(top_ads_links)}
  end

  defp parse_bottom_ads_links(result, document) do
    bottom_ads_links = parse_links(document, @selectors.bottom_ads_links)

    %{
      result
      | bottom_ads_links: bottom_ads_links,
        total_bottom_ads_links: length(bottom_ads_links)
    }
  end

  defp parse_raw_html_result(result, html) do
    %{result | raw_html_result: cleanup_html(html)}
  end

  # Returns the href attributes of all elements matching the selector.
  defp parse_links(document, selector) do
    document
    |> Floki.find(selector)
    |> Floki.attribute("href")
  end

  # Drops non-printable chunks so the raw HTML can be stored as a plain
  # string without encoding errors.
  defp cleanup_html(html) do
    html
    |> String.chunk(:printable)
    |> Enum.filter(&String.printable?/1)
    |> Enum.join()
  end
end
10 changes: 10 additions & 0 deletions lib/google_crawler/google/scraper_result.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
defmodule GoogleCrawler.Google.ScraperResult do
  @moduledoc """
  Result of scraping one Google search page: the sanitized raw HTML plus
  the extracted links and their counts.
  """

  @type t :: %__MODULE__{
          raw_html_result: String.t() | nil,
          total_results: non_neg_integer(),
          links: [String.t()],
          total_links: non_neg_integer(),
          top_ads_links: [String.t()],
          total_top_ads_links: non_neg_integer(),
          bottom_ads_links: [String.t()],
          total_bottom_ads_links: non_neg_integer()
        }

  defstruct raw_html_result: nil,
            total_results: 0,
            links: [],
            total_links: 0,
            top_ads_links: [],
            total_top_ads_links: 0,
            bottom_ads_links: [],
            total_bottom_ads_links: 0
end
15 changes: 0 additions & 15 deletions lib/google_crawler/google/scrapper.ex

This file was deleted.

81 changes: 80 additions & 1 deletion lib/google_crawler/search.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@ defmodule GoogleCrawler.Search do
"""

import Ecto.Query, warn: false
alias Ecto.Multi
alias GoogleCrawler.Repo

alias GoogleCrawler.Search.Keyword
alias GoogleCrawler.Search.KeywordFile
alias GoogleCrawler.Search.Link
alias GoogleCrawler.Google.ScraperResult

@doc """
Returns the list of keywords belongs to the given user.
Expand All @@ -21,6 +24,7 @@ defmodule GoogleCrawler.Search do
def list_user_keywords(user) do
  query =
    from k in Keyword,
      where: k.user_id == ^user.id,
      order_by: [desc: k.inserted_at]

  Repo.all(query)
end

Expand All @@ -38,7 +42,10 @@ defmodule GoogleCrawler.Search do
** (Ecto.NoResultsError)

"""
def get_keyword(id), do: Repo.get(Keyword, id)
def get_keyword(id) do
  # Repo.preload/2 passes nil through, so a missing id still returns nil.
  Keyword
  |> Repo.get(id)
  |> Repo.preload(:links)
end

@doc """
Creates a keyword.
Expand Down Expand Up @@ -98,6 +105,53 @@ defmodule GoogleCrawler.Search do
|> Repo.update()
end

@doc """
Update the search result for a keyword.

## Examples

iex> update_keyword_result(keyword, %{field: new_value})
{:ok, %Keyword{}}

iex> update_keyword_result(keyword, %{field: bad_value})
{:error, %Ecto.Changeset{}}

"""
def update_keyword_result(%Keyword{} = keyword, attrs) do
  changeset = Keyword.update_result_changeset(keyword, attrs)
  Repo.update(changeset)
end

@doc """
Update the keyword result from the scraper result and mark the keyword as completed.
"""
def update_keyword_result_from_scraper(%Keyword{} = keyword, %ScraperResult{} = result) do
  attrs = %{
    status: :completed,
    raw_html_result: result.raw_html_result,
    total_results: result.total_results,
    total_ads_links: result.total_top_ads_links + result.total_bottom_ads_links,
    total_links: result.total_links
  }

  # Insertion order matters: the generated "link_N" operation names are
  # derived from the number of operations already in the multi.
  Multi.new()
  |> create_keyword_link_multi(keyword, result.top_ads_links, %{is_ads: true, ads_position: :top})
  |> create_keyword_link_multi(keyword, result.bottom_ads_links, %{
    is_ads: true,
    ads_position: :bottom
  })
  |> create_keyword_link_multi(keyword, result.links, %{is_ads: false})
  |> Multi.update(:keyword, Keyword.update_result_changeset(keyword, attrs))
  |> Repo.transaction()
end

@doc """
Parses the keyword from the given file.
Returns the stream for each line in the csv file as [line_result].
Expand All @@ -112,4 +166,29 @@ defmodule GoogleCrawler.Search do
def parse_keywords_from_file!(file_path, mime_type),
  do: KeywordFile.parse!(file_path, mime_type)

@doc """
List result links of the given keyword.
"""
def list_keyword_links(keyword, query \\ []) do
  query_expr =
    from l in Link,
      where: l.keyword_id == ^keyword.id,
      where: ^query,
      order_by: [desc: l.is_ads]

  Repo.all(query_expr)
end

# Create the multi to insert the links.
# Other attributes of the link except the link itself must be specified
defp create_keyword_link_multi(multi, keyword, links, attrs) do
  # Fold each link into the multi; the empty list naturally returns the
  # multi unchanged. Operation names mirror the original recursive version:
  # "link_" plus the count of operations already accumulated.
  Enum.reduce(links, multi, fn link, acc ->
    changeset =
      keyword
      |> Ecto.build_assoc(:links)
      |> Link.changeset(Map.put(attrs, :url, link))

    Multi.insert(acc, "link_#{length(acc.operations)}", changeset)
  end)
end
end
15 changes: 14 additions & 1 deletion lib/google_crawler/search/keyword.ex
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,30 @@ defmodule GoogleCrawler.Search.Keyword do
field :keyword, :string
field :status, GoogleCrawler.Search.Keyword.Status, default: :in_queue
field :raw_html_result, :string
field :total_results, :integer
field :total_ads_links, :integer
field :total_links, :integer

belongs_to :user, GoogleCrawler.Accounts.User
has_many :links, GoogleCrawler.Search.Link

timestamps()
end

@fields ~w(keyword user_id status raw_html_result)a
@fields ~w(keyword user_id status raw_html_result total_results total_ads_links total_links)a

@doc """
Base changeset for a keyword.

Casts every known field and requires the minimum needed to persist a
keyword: `:keyword`, `:user_id` and `:status`.
"""
def changeset(keyword, attrs \\ %{}) do
  keyword
  |> cast(attrs, @fields)
  |> validate_required([:keyword, :user_id, :status])
end

@doc """
Changeset for recording a completed search result on a keyword.

Requires the raw HTML plus all three totals, and rejects negative counts.
"""
def update_result_changeset(keyword, attrs \\ %{}) do
  count_fields = [:total_results, :total_ads_links, :total_links]

  base =
    keyword
    |> changeset(attrs)
    |> validate_required([:raw_html_result | count_fields])

  # Every count is a tally, so none may be negative.
  Enum.reduce(count_fields, base, fn field, cs ->
    validate_number(cs, field, greater_than_or_equal_to: 0)
  end)
end
end
34 changes: 34 additions & 0 deletions lib/google_crawler/search/link.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import EctoEnum

defenum(GoogleCrawler.Search.Link.AdsPosition, top: 0, bottom: 1)

defmodule GoogleCrawler.Search.Link do
  @moduledoc """
  A single result link scraped for a keyword, flagged as ads or organic.
  """

  use Ecto.Schema
  import Ecto.Changeset

  schema "links" do
    field :url, :string
    field :is_ads, :boolean, default: false
    field :ads_position, GoogleCrawler.Search.Link.AdsPosition

    belongs_to :keyword, GoogleCrawler.Search.Keyword

    timestamps()
  end

  @fields ~w(url is_ads ads_position)a

  @doc """
  Builds a changeset for a link.

  `:ads_position` is required whenever the link is an ad.
  """
  def changeset(link, attrs \\ %{}) do
    link
    |> cast(attrs, @fields)
    |> validate_required([:url, :is_ads])
    |> validate_ads_position()
  end

  # Require ads_position based on the effective value of is_ads (changes
  # merged over data). The original matched only on `changes`, so updating
  # a record that was already an ad never enforced ads_position.
  def validate_ads_position(changeset) do
    if get_field(changeset, :is_ads) do
      validate_required(changeset, :ads_position)
    else
      changeset
    end
  end
end
4 changes: 2 additions & 2 deletions lib/google_crawler/search/search_keyword_task.ex
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
defmodule GoogleCrawler.Search.SearchKeywordTask do
alias GoogleCrawler.Search.Keyword
alias GoogleCrawler.Google.Scrapper
alias GoogleCrawler.Google.Scraper

def perform(%Keyword{} = keyword) do
case google_api_client().search(keyword.keyword) do
{:ok, body} ->
Scrapper.scrap(body)
Scraper.scrap(body)

{:error, reason} ->
raise "Keyword search failed: #{reason}"
Expand Down
Loading