From 26a8ce651749fcbec48130f44a0db03a22fb5ea4 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Fri, 3 Apr 2020 11:54:48 +0700 Subject: [PATCH 01/17] Add the result scrapper --- lib/google_crawler/google/scrapper.ex | 38 ++- mix.exs | 3 +- test/fixtures/search_results/hotels.html | 226 ++++++++++++++++++ .../{ => search_results}/search_result.html | 0 test/google_crawler/google/scrapper_test.exs | 22 ++ test/support/mocks/google_api_client.ex | 4 +- 6 files changed, 286 insertions(+), 7 deletions(-) create mode 100644 test/fixtures/search_results/hotels.html rename test/fixtures/{ => search_results}/search_result.html (100%) create mode 100644 test/google_crawler/google/scrapper_test.exs diff --git a/lib/google_crawler/google/scrapper.ex b/lib/google_crawler/google/scrapper.ex index 2940793..ac8b223 100644 --- a/lib/google_crawler/google/scrapper.ex +++ b/lib/google_crawler/google/scrapper.ex @@ -1,9 +1,39 @@ +defmodule GoogleCrawler.Google.ScrapperResult do + defstruct raw_html_result: nil, + total_results: 0 +end + defmodule GoogleCrawler.Google.Scrapper do + alias GoogleCrawler.Google.ScrapperResult + + # @selectors %{ + # total_result: "#result-stats" + # } + def scrap(html) do - # TODO: Scrap the page content - %{ - raw_html_result: cleanup_html(html) - } + result = %ScrapperResult{} + + {:ok, document} = Floki.parse_document(html) + + parse_raw_html_result(result, html) + |> parse_total_results(document) + end + + def parse_total_results(result, document) do + total_results_text = Floki.find(document, "#result-stats") |> Floki.text + + total_results = + Regex.named_captures(~r/About (?.*) results/, total_results_text) + |> Map.get("total_result") + |> String.replace(",", "") + |> Integer.parse + |> elem(0) + + %{ result | total_results: total_results } + end + + def parse_raw_html_result(result, html) do + %{ result | raw_html_result: cleanup_html(html) } end def cleanup_html(html) do diff --git a/mix.exs b/mix.exs index 84cd248..d8a21d0 100644 --- a/mix.exs +++ b/mix.exs @@ -47,7 +47,8 @@ defmodule GoogleCrawler.MixProject do {:faker_elixir_octopus, "~> 1.0.0", only: [:dev, :test]}, {:csv, "~> 2.3"}, {:httpoison, "~> 1.6"}, - {:ecto_enum, "~> 1.4"} + {:ecto_enum, "~> 1.4"}, + {:floki, "~> 0.26.0"} ] end diff --git a/test/fixtures/search_results/hotels.html b/test/fixtures/search_results/hotels.html new file mode 100644 index 0000000..3a74990 --- /dev/null +++ b/test/fixtures/search_results/hotels.html @@ -0,0 +1,226 @@ + +kittens for adoption - Google Search

[fixture body omitted: ~220 lines of rendered Google search-results markup for the "kittens for adoption" page — accessibility links, "About 3,820,000,000 results (0.65 seconds)", a pet-adoption ad block, "People also search for", footer links, and the signed-in Google account header]
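Note: the total-results parsing introduced above can be exercised on its own. A minimal iex sketch — the one-line HTML string is an illustrative stand-in, not part of the fixture; also note the named capture `(?<total_result>...)`, whose angle-bracket group was swallowed when the diff above was rendered as HTML:

{:ok, document} =
  Floki.parse_document(~s|<div id="result-stats">About 3,820,000,000 results (0.65 seconds)</div>|)

# Same pipeline as parse_total_results/2 in the diff above
total_results_text = document |> Floki.find("#result-stats") |> Floki.text()

total_results =
  Regex.named_captures(~r/About (?<total_result>.*) results/, total_results_text)
  |> Map.get("total_result")
  |> String.replace(",", "")
  |> Integer.parse()
  |> elem(0)

# => 3820000000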
diff --git a/test/fixtures/search_result.html b/test/fixtures/search_results/search_result.html similarity index 100% rename from test/fixtures/search_result.html rename to test/fixtures/search_results/search_result.html diff --git a/test/google_crawler/google/scrapper_test.exs b/test/google_crawler/google/scrapper_test.exs new file mode 100644 index 0000000..2760bff --- /dev/null +++ b/test/google_crawler/google/scrapper_test.exs @@ -0,0 +1,22 @@ +defmodule GoogleCrawler.Google.ScrapperTest do + use ExUnit.Case + + alias GoogleCrawler.Google.Scrapper + alias GoogleCrawler.Google.ScrapperResult + + test "scrap/1" do + html = response_fixtures("hotels.html") + + result = Scrapper.scrap(html) + + IO.inspect result + # assert %ScrapperResult{ + # raw_html_result: html + # } == result + end + + defp response_fixtures(path) do + Path.join(["test/fixtures/search_results", path]) + |> File.read!() + end +end diff --git a/test/support/mocks/google_api_client.ex b/test/support/mocks/google_api_client.ex index bf50d94..ef0285a 100644 --- a/test/support/mocks/google_api_client.ex +++ b/test/support/mocks/google_api_client.ex @@ -6,11 +6,11 @@ defmodule GoogleCrawler.Google.MockApiClient do end def search(_keyword) do - {:ok, response_fixtures('search_result.html')} + {:ok, response_fixtures('hotels.html')} end defp response_fixtures(path) do - Path.join(["test/fixtures", path]) + Path.join(["test/fixtures/search_results", path]) |> File.read!() end end From d78b5a8ab75cc89a7bf6873b2c2c9693c5248905 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Tue, 14 Apr 2020 15:49:05 +0700 Subject: [PATCH 02/17] Scrap the links from google result --- lib/google_crawler/google/scrapper.ex | 66 ++- test/fixtures/search_results/hotels.html | 443 ++++++++++--------- test/google_crawler/google/scrapper_test.exs | 34 +- 3 files changed, 316 insertions(+), 227 deletions(-) diff --git a/lib/google_crawler/google/scrapper.ex b/lib/google_crawler/google/scrapper.ex index ac8b223..01d2b3b 100644 --- a/lib/google_crawler/google/scrapper.ex +++ b/lib/google_crawler/google/scrapper.ex @@ -1,14 +1,23 @@ defmodule GoogleCrawler.Google.ScrapperResult do defstruct raw_html_result: nil, - total_results: 0 + total_results: 0, + links: [], + total_links: 0, + top_ads_links: [], + total_top_ads_links: 0, + bottom_ads_links: [], + total_bottom_ads_links: 0 end defmodule GoogleCrawler.Google.Scrapper do alias GoogleCrawler.Google.ScrapperResult - # @selectors %{ - # total_result: "#result-stats" - # } + @selectors %{ + total_result: "#result-stats", + non_ads_links: "div.r > a", + top_ads_links: "#tads .ads-ad > .ad_cclk > a.V0MxL", + bottom_ads_links: "#bottomads .ads-ad > .ad_cclk > a.V0MxL" + } def scrap(html) do result = %ScrapperResult{} @@ -17,26 +26,61 @@ defmodule GoogleCrawler.Google.Scrapper do parse_raw_html_result(result, html) |> parse_total_results(document) + |> parse_non_ads_links(document) + |> parse_top_ads_links(document) + |> parse_bottom_ads_links(document) end - def parse_total_results(result, document) do - total_results_text = Floki.find(document, "#result-stats") |> Floki.text + # TODO: Make sure the result page is in English to match the regex + defp parse_total_results(result, document) do + total_results_text = Floki.find(document, "#result-stats") |> Floki.text() total_results = Regex.named_captures(~r/About (?.*) results/, total_results_text) |> Map.get("total_result") |> String.replace(",", "") - |> Integer.parse + |> Integer.parse() |> elem(0) - %{ result | total_results: total_results } + 
%{result | total_results: total_results} end - def parse_raw_html_result(result, html) do - %{ result | raw_html_result: cleanup_html(html) } + defp parse_non_ads_links(result, document) do + non_ads_links = + document + |> Floki.find(@selectors.non_ads_links) + |> Floki.attribute("href") + + %{result | links: non_ads_links, total_links: length(non_ads_links)} + end + + defp parse_top_ads_links(result, document) do + top_ads_links = + document + |> Floki.find(@selectors.top_ads_links) + |> Floki.attribute("href") + + %{result | top_ads_links: top_ads_links, total_top_ads_links: length(top_ads_links)} + end + + defp parse_bottom_ads_links(result, document) do + bottom_ads_links = + document + |> Floki.find(@selectors.bottom_ads_links) + |> Floki.attribute("href") + + %{ + result + | bottom_ads_links: bottom_ads_links, + total_bottom_ads_links: length(bottom_ads_links) + } + end + + defp parse_raw_html_result(result, html) do + %{result | raw_html_result: cleanup_html(html)} end - def cleanup_html(html) do + defp cleanup_html(html) do html |> String.chunk(:printable) |> Enum.filter(&String.printable?/1) diff --git a/test/fixtures/search_results/hotels.html b/test/fixtures/search_results/hotels.html index 3a74990..44c3bb8 100644 --- a/test/fixtures/search_results/hotels.html +++ b/test/fixtures/search_results/hotels.html @@ -1,226 +1,245 @@ + + +hotels - Google Search

[fixture body omitted: ~440 lines of rendered Google search-results markup — the removed "kittens for adoption" lines and the new "hotels" page with "About 5,970,000,000 results (0.79 seconds)", a Booking.com ad ("Ad·www.booking.com/Hotels", "Lowest Price Guarantee! Book at over 2,590,000 hotels online."), a Bangkok location footer, stray minified gbar JavaScript, and the signed-in Google account header]
\ No newline at end of file diff --git a/test/google_crawler/google/scrapper_test.exs b/test/google_crawler/google/scrapper_test.exs index 2760bff..02816c5 100644 --- a/test/google_crawler/google/scrapper_test.exs +++ b/test/google_crawler/google/scrapper_test.exs @@ -8,15 +8,41 @@ defmodule GoogleCrawler.Google.ScrapperTest do html = response_fixtures("hotels.html") result = Scrapper.scrap(html) + raw_html = cleanup_html(html) - IO.inspect result - # assert %ScrapperResult{ - # raw_html_result: html - # } == result + assert %ScrapperResult{ + raw_html_result: ^raw_html, + total_results: 5_970_000_000, + links: [ + "https://www.hotels.com/", + "https://www.booking.com/city/th/bangkok.html", + "https://www.tripadvisor.com/Hotels-g293916-Bangkok-Hotels.html", + "https://www.agoda.com/city/bangkok-th.html", + "https://www.agoda.com/country/thailand.html", + "https://www.expedia.co.th/en/Bangkok-Hotels.d178236.Travel-Guide-Hotels", + "http://www.bangkok.com/hotels/", + "https://www.trivago.com/bangkok-519/hotel", + "https://www.hotelscombined.com/Place/Bangkok.htm" + ], + total_links: 9, + top_ads_links: [ + "https://www.booking.com/" + ], + total_top_ads_links: 1, + bottom_ads_links: [], + total_bottom_ads_links: 0 + } = result end defp response_fixtures(path) do Path.join(["test/fixtures/search_results", path]) |> File.read!() end + + defp cleanup_html(html) do + html + |> String.chunk(:printable) + |> Enum.filter(&String.printable?/1) + |> Enum.join() + end end From 007238cc0f61308058add0dfd3389fc2e583952c Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Tue, 14 Apr 2020 18:10:38 +0700 Subject: [PATCH 03/17] Add result links table and define the link schema --- lib/google_crawler/google/scrapper.ex | 4 ++-- lib/google_crawler/search.ex | 7 ++++++ lib/google_crawler/search/keyword.ex | 12 +++++++++- lib/google_crawler/search/link.ex | 20 ++++++++++++++++ .../search/search_keyword_worker.ex | 17 ++++++++++---- ..._links_and_add_links_count_to_keywords.exs | 23 +++++++++++++++++++ 6 files changed, 76 insertions(+), 7 deletions(-) create mode 100644 lib/google_crawler/search/link.ex create mode 100644 priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs diff --git a/lib/google_crawler/google/scrapper.ex b/lib/google_crawler/google/scrapper.ex index 01d2b3b..74b919e 100644 --- a/lib/google_crawler/google/scrapper.ex +++ b/lib/google_crawler/google/scrapper.ex @@ -13,7 +13,7 @@ defmodule GoogleCrawler.Google.Scrapper do alias GoogleCrawler.Google.ScrapperResult @selectors %{ - total_result: "#result-stats", + total_results: "#result-stats", non_ads_links: "div.r > a", top_ads_links: "#tads .ads-ad > .ad_cclk > a.V0MxL", bottom_ads_links: "#bottomads .ads-ad > .ad_cclk > a.V0MxL" @@ -33,7 +33,7 @@ defmodule GoogleCrawler.Google.Scrapper do # TODO: Make sure the result page is in English to match the regex defp parse_total_results(result, document) do - total_results_text = Floki.find(document, "#result-stats") |> Floki.text() + total_results_text = Floki.find(document, @selectors.total_results) |> Floki.text() total_results = Regex.named_captures(~r/About (?.*) results/, total_results_text) diff --git a/lib/google_crawler/search.ex b/lib/google_crawler/search.ex index 4e88b88..0f5c8f4 100644 --- a/lib/google_crawler/search.ex +++ b/lib/google_crawler/search.ex @@ -98,6 +98,13 @@ defmodule GoogleCrawler.Search do |> Repo.update() end + # TODO: + def update_keyword_result(%Keyword{} = keyword, attrs) do + keyword + |> Keyword.update_result_changeset(attrs) + 
|> Repo.update() + end + @doc """ Parses the keyword from the given file. Returns the stream for each line in the csv file as [line_result]. diff --git a/lib/google_crawler/search/keyword.ex b/lib/google_crawler/search/keyword.ex index 8872947..878e2bf 100644 --- a/lib/google_crawler/search/keyword.ex +++ b/lib/google_crawler/search/keyword.ex @@ -10,17 +10,27 @@ defmodule GoogleCrawler.Search.Keyword do field :keyword, :string field :status, GoogleCrawler.Search.Keyword.Status, default: :in_queue field :raw_html_result, :string + field :total_results, :integer + field :total_ads_links, :integer + field :total_links, :integer belongs_to :user, GoogleCrawler.Accounts.User + has_many :links, GoogleCrawler.Search.Link timestamps() end - @fields ~w(keyword user_id status raw_html_result)a + @fields ~w(keyword user_id status raw_html_result total_results total_ads_links total_links)a def changeset(keyword, attrs \\ %{}) do keyword |> cast(attrs, @fields) |> validate_required([:keyword, :user_id, :status]) end + + def update_result_changeset(keyword, attrs \\ %{}) do + keyword + |> changeset(attrs) + |> validate_required([:raw_html_result, :total_results, :total_ads_links, :total_links]) + end end diff --git a/lib/google_crawler/search/link.ex b/lib/google_crawler/search/link.ex new file mode 100644 index 0000000..961ec66 --- /dev/null +++ b/lib/google_crawler/search/link.ex @@ -0,0 +1,20 @@ +import EctoEnum + +defenum(GoogleCrawler.Search.Link.AdsPosition, top: 0, bottom: 1) + +defmodule GoogleCrawler.Search.Link do + use Ecto.Schema + + schema "links" do + field :url, :string + field :is_ads, :boolean + field :ads_position, GoogleCrawler.Search.Link.AdsPosition + + belongs_to :keyword, GoogleCrawler.Search.Keyword + end + + # TODO: Implement the changeset + # import Ecto.Changeset + # def changeset(link, attrs \\ %{}) do + # end +end diff --git a/lib/google_crawler/search/search_keyword_worker.ex b/lib/google_crawler/search/search_keyword_worker.ex index f2ed80d..64b558d 100644 --- a/lib/google_crawler/search/search_keyword_worker.ex +++ b/lib/google_crawler/search/search_keyword_worker.ex @@ -3,6 +3,7 @@ defmodule GoogleCrawler.SearchKeywordWorker do alias GoogleCrawler.Search alias GoogleCrawler.Search.Keyword + alias GoogleCrawler.Google.ScrapperResult @max_retry_count 3 @@ -45,10 +46,7 @@ defmodule GoogleCrawler.SearchKeywordWorker do def handle_info({ref, result}, state) do {keyword, _retry_count} = Map.get(state, ref) - Search.update_keyword(keyword, %{ - status: :completed, - raw_html_result: result.raw_html_result - }) + update_keyword_result(keyword, result) # Demonitor the task and remove from the state Process.demonitor(ref, [:flush]) @@ -80,4 +78,15 @@ defmodule GoogleCrawler.SearchKeywordWorker do GoogleCrawler.Search.SearchKeywordTask.perform(keyword) end) end + + # TODO: + defp update_keyword_result(%Keyword{} = keyword, %ScrapperResult{} = result) do + Search.update_keyword(keyword, %{ + status: :completed, + raw_html_result: result.raw_html_result, + total_results: result.total_results, + total_ads_links: result.total_top_ads_link + result.total_bottom_ads_link, + total_links: result.total_links + }) + end end diff --git a/priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs b/priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs new file mode 100644 index 0000000..450e968 --- /dev/null +++ b/priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs @@ -0,0 +1,23 @@ +defmodule 
GoogleCrawler.Repo.Migrations.CreateLinksAndAddLinksCountToKeyword do + use Ecto.Migration + + def change do + alter table(:keywords) do + add :total_results, :integer + add :total_ads_links, :integer + add :total_links, :integer + end + + create table(:links) do + add :url, :text + add :is_ads, :boolean + add :ads_position, :integer + + add :keyword_id, references(:keywords, on_delete: :delete_all), null: false + + timestamps() + end + + create index(:links, [:keyword_id]) + end +end From 400b764d13e4e4bd1f48a3b8ef71698e94e56bf8 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Wed, 15 Apr 2020 10:44:41 +0700 Subject: [PATCH 04/17] Specify the user agent and set result language to english --- lib/google_crawler/google/api_client.ex | 11 +++++++++-- lib/google_crawler/google/scrapper.ex | 1 - lib/google_crawler/search/search_keyword_worker.ex | 3 +-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/google_crawler/google/api_client.ex b/lib/google_crawler/google/api_client.ex index 245013f..982a2af 100644 --- a/lib/google_crawler/google/api_client.ex +++ b/lib/google_crawler/google/api_client.ex @@ -5,10 +5,11 @@ end defmodule GoogleCrawler.Google.ApiClient do @behaviour GoogleCrawler.Google.ApiClientBehaviour - @url "https://www.google.com/search?q=" + @url "https://www.google.com/search?hl=en&q=" + @user_agent "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36" def search(keyword) do - case HTTPoison.get(@url <> URI.encode(keyword)) do + case HTTPoison.get(@url <> URI.encode(keyword), request_headers()) do {:ok, %HTTPoison.Response{status_code: 200, body: body}} -> {:ok, body} @@ -19,4 +20,10 @@ defmodule GoogleCrawler.Google.ApiClient do {:error, reason} end end + + def request_headers do + [ + {"User-Agent", @user_agent} + ] + end end diff --git a/lib/google_crawler/google/scrapper.ex b/lib/google_crawler/google/scrapper.ex index 74b919e..08cbaae 100644 --- a/lib/google_crawler/google/scrapper.ex +++ b/lib/google_crawler/google/scrapper.ex @@ -31,7 +31,6 @@ defmodule GoogleCrawler.Google.Scrapper do |> parse_bottom_ads_links(document) end - # TODO: Make sure the result page is in English to match the regex defp parse_total_results(result, document) do total_results_text = Floki.find(document, @selectors.total_results) |> Floki.text() diff --git a/lib/google_crawler/search/search_keyword_worker.ex b/lib/google_crawler/search/search_keyword_worker.ex index 64b558d..804551e 100644 --- a/lib/google_crawler/search/search_keyword_worker.ex +++ b/lib/google_crawler/search/search_keyword_worker.ex @@ -79,13 +79,12 @@ defmodule GoogleCrawler.SearchKeywordWorker do end) end - # TODO: defp update_keyword_result(%Keyword{} = keyword, %ScrapperResult{} = result) do Search.update_keyword(keyword, %{ status: :completed, raw_html_result: result.raw_html_result, total_results: result.total_results, - total_ads_links: result.total_top_ads_link + result.total_bottom_ads_link, + total_ads_links: result.total_top_ads_links + result.total_bottom_ads_links, total_links: result.total_links }) end From 3da469c556c3bcaedfdacb126b3dc01776e76f3f Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Wed, 15 Apr 2020 11:04:09 +0700 Subject: [PATCH 05/17] Separate the scrapper result file and show the number of result --- lib/google_crawler/google/scrapper.ex | 11 ----------- lib/google_crawler/google/scrapper_result.ex | 10 ++++++++++ lib/google_crawler/search.ex | 13 ++++++++++++- 
.../templates/keyword/show.html.eex | 14 ++++++++++++++ 4 files changed, 36 insertions(+), 12 deletions(-) create mode 100644 lib/google_crawler/google/scrapper_result.ex diff --git a/lib/google_crawler/google/scrapper.ex b/lib/google_crawler/google/scrapper.ex index 08cbaae..1163872 100644 --- a/lib/google_crawler/google/scrapper.ex +++ b/lib/google_crawler/google/scrapper.ex @@ -1,14 +1,3 @@ -defmodule GoogleCrawler.Google.ScrapperResult do - defstruct raw_html_result: nil, - total_results: 0, - links: [], - total_links: 0, - top_ads_links: [], - total_top_ads_links: 0, - bottom_ads_links: [], - total_bottom_ads_links: 0 -end - defmodule GoogleCrawler.Google.Scrapper do alias GoogleCrawler.Google.ScrapperResult diff --git a/lib/google_crawler/google/scrapper_result.ex b/lib/google_crawler/google/scrapper_result.ex new file mode 100644 index 0000000..9354ecb --- /dev/null +++ b/lib/google_crawler/google/scrapper_result.ex @@ -0,0 +1,10 @@ +defmodule GoogleCrawler.Google.ScrapperResult do + defstruct raw_html_result: nil, + total_results: 0, + links: [], + total_links: 0, + top_ads_links: [], + total_top_ads_links: 0, + bottom_ads_links: [], + total_bottom_ads_links: 0 +end diff --git a/lib/google_crawler/search.ex b/lib/google_crawler/search.ex index 0f5c8f4..0039a01 100644 --- a/lib/google_crawler/search.ex +++ b/lib/google_crawler/search.ex @@ -98,7 +98,18 @@ defmodule GoogleCrawler.Search do |> Repo.update() end - # TODO: + @doc """ + Update the search result for a keyword. + + ## Examples + + iex> update_keyword_result(keyword, %{field: new_value}) + {:ok, %Keyword{}} + + iex> update_keyword_result(keyword, %{field: bad_value}) + {:error, %Ecto.Changeset{}} + + """ def update_keyword_result(%Keyword{} = keyword, attrs) do keyword |> Keyword.update_result_changeset(attrs) diff --git a/lib/google_crawler_web/templates/keyword/show.html.eex b/lib/google_crawler_web/templates/keyword/show.html.eex index f297b04..caeedd5 100644 --- a/lib/google_crawler_web/templates/keyword/show.html.eex +++ b/lib/google_crawler_web/templates/keyword/show.html.eex @@ -1,5 +1,19 @@

<%= gettext("Keyword: %{keyword}", keyword: @keyword.keyword) %>

+
+
    +
  • + <%= gettext("Total Results: %{total_results}", total_results: @keyword.total_results)%> +
  • +
  • + <%= gettext("Total Ads: %{total_ads}", total_ads: @keyword.total_ads_links)%> +
  • +
  • + <%= gettext("Total Non-Ads: %{total_non_ads}", total_non_ads: @keyword.total_links)%> +
  • +
+
+

Result:

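Note: with ScrapperResult split into its own file, the scrape-and-persist flow at this point in the series is roughly the following (a sketch, assuming `keyword` is a %Keyword{} already loaded via the Search context):

alias GoogleCrawler.Google.{ApiClient, Scrapper, ScrapperResult}
alias GoogleCrawler.Search

# Fetch the results page, scrap it, then persist the summary fields
{:ok, html} = ApiClient.search(keyword.keyword)
%ScrapperResult{} = result = Scrapper.scrap(html)

Search.update_keyword_result(keyword, %{
  raw_html_result: result.raw_html_result,
  total_results: result.total_results,
  total_ads_links: result.total_top_ads_links + result.total_bottom_ads_links,
  total_links: result.total_links
})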
From 4690bfbdd17dcc3648d47b627aaff8962b69c380 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Wed, 15 Apr 2020 12:42:27 +0700 Subject: [PATCH 06/17] Save the keyword result links to the database --- lib/google_crawler/search.ex | 58 +++++++++++++++++++ lib/google_crawler/search/link.ex | 33 +++++++++-- .../search/search_keyword_worker.ex | 15 ++--- test/factories/scrapper_result_factory.ex | 33 +++++++++++ test/google_crawler/search_test.exs | 48 +++++++++++++++ 5 files changed, 172 insertions(+), 15 deletions(-) create mode 100644 test/factories/scrapper_result_factory.ex diff --git a/lib/google_crawler/search.ex b/lib/google_crawler/search.ex index 0039a01..ff867fe 100644 --- a/lib/google_crawler/search.ex +++ b/lib/google_crawler/search.ex @@ -4,10 +4,13 @@ defmodule GoogleCrawler.Search do """ import Ecto.Query, warn: false + alias Ecto.Multi alias GoogleCrawler.Repo alias GoogleCrawler.Search.Keyword alias GoogleCrawler.Search.KeywordFile + alias GoogleCrawler.Search.Link + alias GoogleCrawler.Google.ScrapperResult @doc """ Returns the list of keywords belongs to the given user. @@ -116,6 +119,35 @@ defmodule GoogleCrawler.Search do |> Repo.update() end + @doc """ + Update the keyword result from the scrapper result and mark the keyword as completed. + """ + def update_keyword_result_from_scrapper(%Keyword{} = keyword, %ScrapperResult{} = result) do + keyword_changeset = + Keyword.update_result_changeset(keyword, %{ + status: :completed, + raw_html_result: result.raw_html_result, + total_results: result.total_results, + total_ads_links: result.total_top_ads_links + result.total_bottom_ads_links, + total_links: result.total_links + }) + + Multi.new() + |> create_keyword_link_multi(keyword, result.top_ads_links, %{ + is_ads: true, + ads_position: :top + }) + |> create_keyword_link_multi(keyword, result.bottom_ads_links, %{ + is_ads: true, + ads_position: :bottom + }) + |> create_keyword_link_multi(keyword, result.links, %{ + is_ads: false + }) + |> Multi.update(:keyword, keyword_changeset) + |> Repo.transaction() + end + @doc """ Parses the keyword from the given file. Returns the stream for each line in the csv file as [line_result]. @@ -130,4 +162,30 @@ defmodule GoogleCrawler.Search do def parse_keywords_from_file!(file_path, mime_type) do KeywordFile.parse!(file_path, mime_type) end + + @doc """ + List result links of the given keyword. + """ + def list_keyword_links(keyword, query \\ []) do + Link + |> where(keyword_id: ^keyword.id) + |> where(^query) + |> order_by(desc: :is_ads) + |> Repo.all() + end + + # TODO: Recheck how to document private functions + # Create the multi to insert the links. 
+ # Other attributes of the link except the link itself must be specified + defp create_keyword_link_multi(multi, _keyword, [], _attrs), do: multi + + defp create_keyword_link_multi(multi, keyword, [link | rest_of_links], attrs) do + changeset = + Ecto.build_assoc(keyword, :links) + |> Link.changeset(Map.put(attrs, :url, link)) + + multi + |> Multi.insert("link_#{length(multi.operations)}", changeset) + |> create_keyword_link_multi(keyword, rest_of_links, attrs) + end end diff --git a/lib/google_crawler/search/link.ex b/lib/google_crawler/search/link.ex index 961ec66..be8a73f 100644 --- a/lib/google_crawler/search/link.ex +++ b/lib/google_crawler/search/link.ex @@ -4,6 +4,7 @@ defenum(GoogleCrawler.Search.Link.AdsPosition, top: 0, bottom: 1) defmodule GoogleCrawler.Search.Link do use Ecto.Schema + import Ecto.Changeset schema "links" do field :url, :string @@ -11,10 +12,34 @@ defmodule GoogleCrawler.Search.Link do field :ads_position, GoogleCrawler.Search.Link.AdsPosition belongs_to :keyword, GoogleCrawler.Search.Keyword + + timestamps() + end + + @fields ~w(url is_ads ads_position)a + + def changeset(link, attrs \\ %{}) do + link + |> cast(attrs, @fields) + |> validate_required([:url, :is_ads]) + |> validate_ads_position() end - # TODO: Implement the changeset - # import Ecto.Changeset - # def changeset(link, attrs \\ %{}) do - # end + def validate_ads_position(%Ecto.Changeset{changes: %{is_ads: true}} = changeset) do + validate_change(changeset, :ads_position, fn :ads_position, ads_position -> + case ads_position do + nil -> [ads_position: "is required"] + _ -> [] + end + end) + end + + def validate_ads_position(%Ecto.Changeset{changes: %{is_ads: false}} = changeset) do + validate_change(changeset, :ads_position, fn :ads_position, ads_position -> + case ads_position do + nil -> [] + _ -> [ads_position: "must be nil"] + end + end) + end end diff --git a/lib/google_crawler/search/search_keyword_worker.ex b/lib/google_crawler/search/search_keyword_worker.ex index 804551e..8901e80 100644 --- a/lib/google_crawler/search/search_keyword_worker.ex +++ b/lib/google_crawler/search/search_keyword_worker.ex @@ -46,7 +46,10 @@ defmodule GoogleCrawler.SearchKeywordWorker do def handle_info({ref, result}, state) do {keyword, _retry_count} = Map.get(state, ref) - update_keyword_result(keyword, result) + case Search.update_keyword_result_from_scrapper(keyword, result) do + {:ok, _result} -> :ok + {:error, _reason} -> send(self(), {:DOWN, ref, :process, self(), :failed_to_save_result}) + end # Demonitor the task and remove from the state Process.demonitor(ref, [:flush]) @@ -78,14 +81,4 @@ defmodule GoogleCrawler.SearchKeywordWorker do GoogleCrawler.Search.SearchKeywordTask.perform(keyword) end) end - - defp update_keyword_result(%Keyword{} = keyword, %ScrapperResult{} = result) do - Search.update_keyword(keyword, %{ - status: :completed, - raw_html_result: result.raw_html_result, - total_results: result.total_results, - total_ads_links: result.total_top_ads_links + result.total_bottom_ads_links, - total_links: result.total_links - }) - end end diff --git a/test/factories/scrapper_result_factory.ex b/test/factories/scrapper_result_factory.ex new file mode 100644 index 0000000..45c7383 --- /dev/null +++ b/test/factories/scrapper_result_factory.ex @@ -0,0 +1,33 @@ +defmodule GoogleCrawler.ScrapperResultFactory do + def default_attrs(total_links, total_top_ads_links, total_bottom_ads_links) do + total_links = total_links || Enum.random(0..10) + total_top_ads_links = total_top_ads_links || Enum.random(0..5) + 
total_bottom_ads_links = total_bottom_ads_links || Enum.random(0..5) + + %{ + raw_html_result: FakerElixir.Lorem.sentences(10..20), + total_results: Enum.random(100_000..200_000), + links: build_link(total_links), + total_links: total_links, + top_ads_links: build_link(total_top_ads_links), + total_top_ads_links: total_top_ads_links, + bottom_ads_links: build_link(total_bottom_ads_links), + total_bottom_ads_links: total_bottom_ads_links + } + end + + def build_attrs(attrs \\ %{}) do + Enum.into( + attrs, + default_attrs( + attrs[:total_links], + attrs[:total_top_ads_links], + attrs[:total_bottom_ads_links] + ) + ) + end + + defp build_link(count) do + for _ <- 0..(count - 1), do: FakerElixir.Internet.url() + end +end diff --git a/test/google_crawler/search_test.exs b/test/google_crawler/search_test.exs index 52c9b49..9151d12 100644 --- a/test/google_crawler/search_test.exs +++ b/test/google_crawler/search_test.exs @@ -3,7 +3,9 @@ defmodule GoogleCrawler.SearchTest do alias GoogleCrawler.Search alias GoogleCrawler.Search.Keyword + alias GoogleCrawler.Google.ScrapperResult alias GoogleCrawler.KeywordFactory + alias GoogleCrawler.ScrapperResultFactory alias GoogleCrawler.UserFactory describe "keywords" do @@ -72,6 +74,48 @@ defmodule GoogleCrawler.SearchTest do assert {:error, %Ecto.Changeset{}} = Search.update_keyword(keyword, keyword_attrs) assert Repo.get_by(Keyword, keyword: keyword.keyword) != nil end + + test "update_keyword_result_from_scrapper/2 updates the keyword results and associates the keyword links" do + keyword = KeywordFactory.create() + + scrapper_result = + struct( + ScrapperResult, + ScrapperResultFactory.build_attrs( + total_results: 50_000, + total_links: 10, + total_top_ads_links: 3, + total_bottom_ads_links: 1 + ) + ) + + Search.update_keyword_result_from_scrapper(keyword, scrapper_result) + + # Keyword result summary is updated + assert %{ + total_results: 50_000, + total_links: 10, + total_ads_links: 4 + } = Search.get_keyword(keyword.id) + + # Top Ads links is inserted + top_ads_query = [is_ads: true, ads_position: :top] + top_ads_links = get_link_urls(Search.list_keyword_links(keyword, top_ads_query)) + assert 3 = length(top_ads_links) + assert scrapper_result.top_ads_links == top_ads_links + + # Bottom Ads links is inserted + bottom_ads_query = [is_ads: true, ads_position: :bottom] + bottom_ads_links = get_link_urls(Search.list_keyword_links(keyword, bottom_ads_query)) + assert 1 = length(bottom_ads_links) + assert scrapper_result.bottom_ads_links == bottom_ads_links + + # Non-Ads links is inserted + non_ads_query = [is_ads: false] + links = get_link_urls(Search.list_keyword_links(keyword, non_ads_query)) + assert 10 = length(links) + assert scrapper_result.links == links + end end describe "keyword file" do @@ -94,4 +138,8 @@ defmodule GoogleCrawler.SearchTest do end end end + + defp get_link_urls(links) do + Enum.map(links, &Map.get(&1, :url)) + end end From b8b247b226d398d5de2ae16b3a8974ab727dfcb8 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Wed, 15 Apr 2020 15:43:42 +0700 Subject: [PATCH 07/17] Extract similar code to function for scrapper --- lib/google_crawler/google/scrapper.ex | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/lib/google_crawler/google/scrapper.ex b/lib/google_crawler/google/scrapper.ex index 1163872..61148d3 100644 --- a/lib/google_crawler/google/scrapper.ex +++ b/lib/google_crawler/google/scrapper.ex @@ -34,28 +34,19 @@ defmodule GoogleCrawler.Google.Scrapper do end defp 
parse_non_ads_links(result, document) do - non_ads_links = - document - |> Floki.find(@selectors.non_ads_links) - |> Floki.attribute("href") + non_ads_links = parse_links(document, @selectors.non_ads_links) %{result | links: non_ads_links, total_links: length(non_ads_links)} end defp parse_top_ads_links(result, document) do - top_ads_links = - document - |> Floki.find(@selectors.top_ads_links) - |> Floki.attribute("href") + top_ads_links = parse_links(document, @selectors.top_ads_links) %{result | top_ads_links: top_ads_links, total_top_ads_links: length(top_ads_links)} end defp parse_bottom_ads_links(result, document) do - bottom_ads_links = - document - |> Floki.find(@selectors.bottom_ads_links) - |> Floki.attribute("href") + bottom_ads_links = parse_links(document, @selectors.bottom_ads_links) %{ result @@ -68,6 +59,12 @@ defmodule GoogleCrawler.Google.Scrapper do %{result | raw_html_result: cleanup_html(html)} end + defp parse_links(document, selector) do + document + |> Floki.find(selector) + |> Floki.attribute("href") + end + defp cleanup_html(html) do html |> String.chunk(:printable) From d0be953c26c5b6832217ab230471bb33e8338146 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Wed, 15 Apr 2020 17:11:51 +0700 Subject: [PATCH 08/17] Add tests for link --- lib/google_crawler/search/link.ex | 21 ++++-------- test/factories/link_factory.ex | 27 ++++++++++++++++ test/google_crawler/search/link_test.exs | 41 ++++++++++++++++++++++++ 3 files changed, 74 insertions(+), 15 deletions(-) create mode 100644 test/factories/link_factory.ex create mode 100644 test/google_crawler/search/link_test.exs diff --git a/lib/google_crawler/search/link.ex b/lib/google_crawler/search/link.ex index be8a73f..dcc6e3c 100644 --- a/lib/google_crawler/search/link.ex +++ b/lib/google_crawler/search/link.ex @@ -8,7 +8,7 @@ defmodule GoogleCrawler.Search.Link do schema "links" do field :url, :string - field :is_ads, :boolean + field :is_ads, :boolean, default: false field :ads_position, GoogleCrawler.Search.Link.AdsPosition belongs_to :keyword, GoogleCrawler.Search.Keyword @@ -26,20 +26,11 @@ defmodule GoogleCrawler.Search.Link do end def validate_ads_position(%Ecto.Changeset{changes: %{is_ads: true}} = changeset) do - validate_change(changeset, :ads_position, fn :ads_position, ads_position -> - case ads_position do - nil -> [ads_position: "is required"] - _ -> [] - end - end) + case get_field(changeset, :ads_position) do + nil -> add_error(changeset, :ads_position, "can't be blank") + _ -> changeset + end end - def validate_ads_position(%Ecto.Changeset{changes: %{is_ads: false}} = changeset) do - validate_change(changeset, :ads_position, fn :ads_position, ads_position -> - case ads_position do - nil -> [] - _ -> [ads_position: "must be nil"] - end - end) - end + def validate_ads_position(changeset), do: changeset end diff --git a/test/factories/link_factory.ex b/test/factories/link_factory.ex new file mode 100644 index 0000000..582a941 --- /dev/null +++ b/test/factories/link_factory.ex @@ -0,0 +1,27 @@ +defmodule GoogleCrawler.LinkFactory do + alias GoogleCrawler.Repo + alias GoogleCrawler.Search.Link + alias GoogleCrawler.KeywordFactory + + def default_attrs do + %{ + url: FakerElixir.Internet.url(), + is_ads: false + } + end + + def build_attrs(attrs \\ %{}) do + Enum.into(attrs, default_attrs()) + end + + def create(attrs \\ %{}, keyword \\ KeywordFactory.create()) do + link_attrs = build_attrs(attrs) + + {:ok, link} = + Ecto.build_assoc(keyword, :links) + |> Link.changeset(link_attrs) + |> Repo.insert + 
+ link + end +end diff --git a/test/google_crawler/search/link_test.exs b/test/google_crawler/search/link_test.exs new file mode 100644 index 0000000..19a5920 --- /dev/null +++ b/test/google_crawler/search/link_test.exs @@ -0,0 +1,41 @@ +defmodule Googlecrawler.Search.LinkTest do + use GoogleCrawler.DataCase + + alias GoogleCrawler.LinkFactory + alias GoogleCrawler.Search.Link + + describe "changeset" do + test "url is required" do + attrs = LinkFactory.build_attrs(%{url: ""}) + changeset = Link.changeset(%Link{}, attrs) + + refute changeset.valid? + assert %{url: ["can't be blank"]} = errors_on(changeset) + end + + test "is_ads is required" do + attrs = LinkFactory.build_attrs(%{is_ads: nil}) + changeset = Link.changeset(%Link{}, attrs) + + refute changeset.valid? + assert %{is_ads: ["can't be blank"]} = errors_on(changeset) + end + + test "ads position is required if the link is ads" do + attrs = LinkFactory.build_attrs(%{is_ads: true, ads_position: nil}) + changeset = Link.changeset(%Link{}, attrs) + + refute changeset.valid? + assert %{ads_position: ["can't be blank"]} = errors_on(changeset) + end + + test "ads position is valid" do + attrs = LinkFactory.build_attrs(%{is_ads: true, ads_position: :left}) + changeset = Link.changeset(%Link{}, attrs) + + refute changeset.valid? + # TODO: Recheck the validations + assert %{ads_position: ["can't be blank", "is invalid"]} = errors_on(changeset) + end + end +end From 294909141dac5a6921454885b656097defd7f20d Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Wed, 15 Apr 2020 17:38:33 +0700 Subject: [PATCH 09/17] Add tests for keyword result validations --- lib/google_crawler/search/keyword.ex | 3 + test/google_crawler/search/keyword_test.exs | 82 +++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/lib/google_crawler/search/keyword.ex b/lib/google_crawler/search/keyword.ex index 878e2bf..a775ad9 100644 --- a/lib/google_crawler/search/keyword.ex +++ b/lib/google_crawler/search/keyword.ex @@ -32,5 +32,8 @@ defmodule GoogleCrawler.Search.Keyword do keyword |> changeset(attrs) |> validate_required([:raw_html_result, :total_results, :total_ads_links, :total_links]) + |> validate_number(:total_results, greater_than_or_equal_to: 0) + |> validate_number(:total_ads_links, greater_than_or_equal_to: 0) + |> validate_number(:total_links, greater_than_or_equal_to: 0) end end diff --git a/test/google_crawler/search/keyword_test.exs b/test/google_crawler/search/keyword_test.exs index 7e4b4b9..b691b0a 100644 --- a/test/google_crawler/search/keyword_test.exs +++ b/test/google_crawler/search/keyword_test.exs @@ -40,4 +40,86 @@ defmodule Googlecrawler.Search.KeywordTest do assert %{status: ["is invalid"]} = errors_on(changeset) end end + + describe "update result changeset" do + test "raw_html_result is required" do + attrs = KeywordFactory.build_attrs(%{raw_html_result: ""}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{raw_html_result: ["can't be blank"]} = errors_on(changeset) + end + + test "total_results is required" do + attrs = KeywordFactory.build_attrs(%{total_results: ""}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_results: ["can't be blank"]} = errors_on(changeset) + end + + test "total_results is a number" do + attrs = KeywordFactory.build_attrs(%{total_results: "invalid"}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? 
+ assert %{total_results: ["is invalid"]} = errors_on(changeset) + end + + test "total_results is greater than or equal to 0" do + attrs = KeywordFactory.build_attrs(%{total_results: -1}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_results: ["must be greater than or equal to 0"]} = errors_on(changeset) + end + + test "total_ads_links is required" do + attrs = KeywordFactory.build_attrs(%{total_ads_links: ""}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_ads_links: ["can't be blank"]} = errors_on(changeset) + end + + test "total_ads_links is a number" do + attrs = KeywordFactory.build_attrs(%{total_ads_links: "invalid"}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_ads_links: ["is invalid"]} = errors_on(changeset) + end + + test "total_ads_links is greater than or equal to 0" do + attrs = KeywordFactory.build_attrs(%{total_ads_links: -1}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_ads_links: ["must be greater than or equal to 0"]} = errors_on(changeset) + end + + test "total_links is required" do + attrs = KeywordFactory.build_attrs(%{total_links: ""}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_links: ["can't be blank"]} = errors_on(changeset) + end + + test "total_links is a number" do + attrs = KeywordFactory.build_attrs(%{total_links: "invalid"}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_links: ["is invalid"]} = errors_on(changeset) + end + + test "total_links is greater than or equal to 0" do + attrs = KeywordFactory.build_attrs(%{total_links: -1}) + changeset = Keyword.update_result_changeset(%Keyword{}, attrs) + + refute changeset.valid? + assert %{total_links: ["must be greater than or equal to 0"]} = errors_on(changeset) + end + end end From d5d2274d3b7492ff8577afc7b165e310081c1b32 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Wed, 15 Apr 2020 18:37:35 +0700 Subject: [PATCH 10/17] Handle the error when database insertion is failed --- .../search/search_keyword_worker.ex | 56 +++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/lib/google_crawler/search/search_keyword_worker.ex b/lib/google_crawler/search/search_keyword_worker.ex index 8901e80..912cedc 100644 --- a/lib/google_crawler/search/search_keyword_worker.ex +++ b/lib/google_crawler/search/search_keyword_worker.ex @@ -1,9 +1,13 @@ defmodule GoogleCrawler.SearchKeywordWorker do + @moduledoc """ + Perform the keyword search and scrap in background. + Update the result of the keyword after it is successfully scraped. + The retry mechanism is implemented. So the task will be retried if it is failed. 
+ """ use GenServer alias GoogleCrawler.Search alias GoogleCrawler.Search.Keyword - alias GoogleCrawler.Google.ScrapperResult @max_retry_count 3 @@ -44,38 +48,46 @@ defmodule GoogleCrawler.SearchKeywordWorker do end def handle_info({ref, result}, state) do - {keyword, _retry_count} = Map.get(state, ref) - - case Search.update_keyword_result_from_scrapper(keyword, result) do - {:ok, _result} -> :ok - {:error, _reason} -> send(self(), {:DOWN, ref, :process, self(), :failed_to_save_result}) - end + {keyword, retry_count} = Map.get(state, ref) - # Demonitor the task and remove from the state - Process.demonitor(ref, [:flush]) - new_state = Map.delete(state, ref) + new_state = + case Search.update_keyword_result_from_scrapper(keyword, result) do + {:ok, _result} -> + # Demonitor the task and remove from the state + Process.demonitor(ref, [:flush]) + Map.delete(state, ref) + + {:error, _reason} -> + maybe_retry(state, ref, keyword, retry_count) + end {:noreply, new_state} end + def handle_info({:DOWN, _ref, :process, _pid, :normal}, state) do + {:noreply, state} + end + def handle_info({:DOWN, ref, :process, _pid, _reason}, state) do {keyword, retry_count} = Map.get(state, ref) - - new_state = - if retry_count < @max_retry_count do - task = start_task(keyword) - - state - |> Map.delete(ref) - |> Map.put(task.ref, {keyword, retry_count + 1}) - else - Search.update_keyword(keyword, %{status: :failed}) - Map.delete(state, ref) - end + new_state = maybe_retry(state, ref, keyword, retry_count) {:noreply, new_state} end + defp maybe_retry(state, ref, keyword, retry_count) do + if retry_count < @max_retry_count do + task = start_task(keyword) + + state + |> Map.delete(ref) + |> Map.put(task.ref, {keyword, retry_count + 1}) + else + Search.update_keyword(keyword, %{status: :failed}) + Map.delete(state, ref) + end + end + defp start_task(%Keyword{} = keyword) do Task.Supervisor.async_nolink(GoogleCrawler.TaskSupervisor, fn -> GoogleCrawler.Search.SearchKeywordTask.perform(keyword) From d2da49d279e0ac5c0e64ad5eeb698659d2c8eab9 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Thu, 16 Apr 2020 10:35:43 +0700 Subject: [PATCH 11/17] Fix small typo and unused var --- lib/google_crawler/accounts.ex | 2 +- test/google_crawler/search/search_keyword_worker_test.exs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/google_crawler/accounts.ex b/lib/google_crawler/accounts.ex index 874d58f..4b71e29 100644 --- a/lib/google_crawler/accounts.ex +++ b/lib/google_crawler/accounts.ex @@ -64,7 +64,7 @@ defmodule GoogleCrawler.Accounts do ## Examples iex> auth_user("bob@email.com", "valid_password") - {:ok, $User{}} + {:ok, %User{}} iex> auth_user("bob@email.com", "invalid_password") {:error, "invalid password"} diff --git a/test/google_crawler/search/search_keyword_worker_test.exs b/test/google_crawler/search/search_keyword_worker_test.exs index db0fcd8..07432f4 100644 --- a/test/google_crawler/search/search_keyword_worker_test.exs +++ b/test/google_crawler/search/search_keyword_worker_test.exs @@ -38,7 +38,6 @@ defmodule GoogleCrawler.Search.SearchKeywordWorkerTest do keyword = KeywordFactory.create(%{keyword: "error"}) task = SearchKeywordWorker.search(keyword.id) - task_ref = task.ref # Find a way to test without sleep 😔 :timer.sleep(1000) From df7fc934a04223aa6fe9153ee8f1bf0a1245ba5e Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Thu, 16 Apr 2020 11:35:15 +0700 Subject: [PATCH 12/17] Display the link result --- assets/css/screens/keyword.scss | 21 +++++++++++++++++++ 
lib/google_crawler/search.ex | 5 ++++- .../templates/keyword/show.html.eex | 17 ++++++++++++++- 3 files changed, 41 insertions(+), 2 deletions(-) diff --git a/assets/css/screens/keyword.scss b/assets/css/screens/keyword.scss index ea55079..973e515 100644 --- a/assets/css/screens/keyword.scss +++ b/assets/css/screens/keyword.scss @@ -1,4 +1,25 @@ body.keyword.show { + .badge { + border: { + width: 1px; + style: solid; + radius: 5px; + } + font-size: x-small; + padding: 3px; + text-transform: uppercase; + } + + .badge-ads { + border-color: green; + color: green; + } + + .badge-ads-position { + border-color: grey; + color: grey; + } + iframe { width: 100%; height: 400px; diff --git a/lib/google_crawler/search.ex b/lib/google_crawler/search.ex index ff867fe..d3b2c48 100644 --- a/lib/google_crawler/search.ex +++ b/lib/google_crawler/search.ex @@ -41,7 +41,10 @@ defmodule GoogleCrawler.Search do ** (Ecto.NoResultsError) """ - def get_keyword(id), do: Repo.get(Keyword, id) + def get_keyword(id) do + Repo.get(Keyword, id) + |> Repo.preload(:links) + end @doc """ Creates a keyword. diff --git a/lib/google_crawler_web/templates/keyword/show.html.eex b/lib/google_crawler_web/templates/keyword/show.html.eex index caeedd5..08ab05c 100644 --- a/lib/google_crawler_web/templates/keyword/show.html.eex +++ b/lib/google_crawler_web/templates/keyword/show.html.eex @@ -15,6 +15,21 @@
-<h2>Result:</h2>
+<h2>Results:</h2>
+
+[list markup omitted — iterates @keyword.links, tagging ads entries with the "badge-ads" / "badge-ads-position" classes styled above; exact markup lost in extraction]
+
+<h2>Raw Html Result:</h2>
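Note: because get_keyword/1 now preloads :links, the template above can iterate @keyword.links directly; the filtered reads added earlier in the series still apply (a sketch — `keyword_id` stands for any persisted keyword's id):

keyword = GoogleCrawler.Search.get_keyword(keyword_id)

# Filtered access via list_keyword_links/2 from the earlier commit
top_ads = GoogleCrawler.Search.list_keyword_links(keyword, is_ads: true, ads_position: :top)
organic = GoogleCrawler.Search.list_keyword_links(keyword, is_ads: false)

# Or walk the preloaded association directly
Enum.each(keyword.links, fn link -> IO.puts(link.url) end)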
From 9fdfead1822c584ff3735ee4ccb1f37502209bcc Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Thu, 16 Apr 2020 11:35:45 +0700 Subject: [PATCH 13/17] Fix code format --- test/factories/link_factory.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/factories/link_factory.ex b/test/factories/link_factory.ex index 582a941..6007309 100644 --- a/test/factories/link_factory.ex +++ b/test/factories/link_factory.ex @@ -20,7 +20,7 @@ defmodule GoogleCrawler.LinkFactory do {:ok, link} = Ecto.build_assoc(keyword, :links) |> Link.changeset(link_attrs) - |> Repo.insert + |> Repo.insert() link end From 23e3cde1fd50ef507145d230b7035bd44c82e760 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Thu, 16 Apr 2020 12:03:52 +0700 Subject: [PATCH 14/17] Add back nav and specify keyword ordering in the query --- assets/css/screens/keyword.scss | 4 ++++ lib/google_crawler/search.ex | 1 + .../templates/keyword/show.html.eex | 12 +++++++++--- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/assets/css/screens/keyword.scss b/assets/css/screens/keyword.scss index 973e515..74abcd6 100644 --- a/assets/css/screens/keyword.scss +++ b/assets/css/screens/keyword.scss @@ -1,4 +1,8 @@ body.keyword.show { + nav ul { + list-style: none; + } + .badge { border: { width: 1px; diff --git a/lib/google_crawler/search.ex b/lib/google_crawler/search.ex index d3b2c48..cb8e1e1 100644 --- a/lib/google_crawler/search.ex +++ b/lib/google_crawler/search.ex @@ -24,6 +24,7 @@ defmodule GoogleCrawler.Search do def list_user_keywords(user) do Keyword |> where(user_id: ^user.id) + |> order_by(desc: :inserted_at) |> Repo.all() end diff --git a/lib/google_crawler_web/templates/keyword/show.html.eex b/lib/google_crawler_web/templates/keyword/show.html.eex index 08ab05c..cdfec27 100644 --- a/lib/google_crawler_web/templates/keyword/show.html.eex +++ b/lib/google_crawler_web/templates/keyword/show.html.eex @@ -1,3 +1,9 @@ + +

<%= gettext("Keyword: %{keyword}", keyword: @keyword.keyword) %>

@@ -15,12 +21,12 @@
-<h2>Results:</h2>
+<h2><%= gettext("Results:") %></h2>
 
-<h2>Raw Html Result:</h2>
+<h2><%= gettext("Raw Html Result:") %></h2>
 
From 1b813a38db4e9bba8c7768b26b6f3bc20a190006 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Thu, 16 Apr 2020 12:19:38 +0700 Subject: [PATCH 15/17] Increase the wait time to process the keyword --- ...03012_create_links_and_add_links_count_to_keywords.exs | 2 +- test/google_crawler/search/search_keyword_worker_test.exs | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs b/priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs index 450e968..63c8e1a 100644 --- a/priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs +++ b/priv/repo/migrations/20200414103012_create_links_and_add_links_count_to_keywords.exs @@ -3,7 +3,7 @@ defmodule GoogleCrawler.Repo.Migrations.CreateLinksAndAddLinksCountToKeyword do def change do alter table(:keywords) do - add :total_results, :integer + add :total_results, :bigint add :total_ads_links, :integer add :total_links, :integer end diff --git a/test/google_crawler/search/search_keyword_worker_test.exs b/test/google_crawler/search/search_keyword_worker_test.exs index 07432f4..18ccbaa 100644 --- a/test/google_crawler/search/search_keyword_worker_test.exs +++ b/test/google_crawler/search/search_keyword_worker_test.exs @@ -15,7 +15,9 @@ defmodule GoogleCrawler.Search.SearchKeywordWorkerTest do assert %{^task_ref => {keyword, 0}} = SearchKeywordWorker.get_state() assert Search.get_keyword(keyword.id).status == :in_progress - :timer.sleep(1000) + # Find a way to test without sleep 😔 + :timer.sleep(1500) + assert SearchKeywordWorker.get_state() == %{} end @@ -25,7 +27,7 @@ defmodule GoogleCrawler.Search.SearchKeywordWorkerTest do SearchKeywordWorker.search(keyword.id) # Find a way to test without sleep 😔 - :timer.sleep(1000) + :timer.sleep(1500) keyword = Search.get_keyword(keyword.id) assert keyword.status == :completed @@ -40,7 +42,7 @@ defmodule GoogleCrawler.Search.SearchKeywordWorkerTest do task = SearchKeywordWorker.search(keyword.id) # Find a way to test without sleep 😔 - :timer.sleep(1000) + :timer.sleep(1500) assert Search.get_keyword(keyword.id).status == :failed assert SearchKeywordWorker.get_state() == %{} From 665d3a7324603480c15fb00ab04997e9e360e547 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Thu, 16 Apr 2020 13:42:45 +0700 Subject: [PATCH 16/17] Fix scraper typo --- .../google/{scrapper.ex => scraper.ex} | 6 +++--- .../{scrapper_result.ex => scraper_result.ex} | 2 +- lib/google_crawler/search.ex | 6 +++--- .../search/search_keyword_task.ex | 4 ++-- .../search/search_keyword_worker.ex | 2 +- .../controllers/upload_controller.ex | 2 +- ...t_factory.ex => scraper_result_factory.ex} | 2 +- .../{scrapper_test.exs => scraper_test.exs} | 10 +++++----- test/google_crawler/search_test.exs | 20 +++++++++---------- 9 files changed, 27 insertions(+), 27 deletions(-) rename lib/google_crawler/google/{scrapper.ex => scraper.ex} (94%) rename lib/google_crawler/google/{scrapper_result.ex => scraper_result.ex} (84%) rename test/factories/{scrapper_result_factory.ex => scraper_result_factory.ex} (95%) rename test/google_crawler/google/{scrapper_test.exs => scraper_test.exs} (87%) diff --git a/lib/google_crawler/google/scrapper.ex b/lib/google_crawler/google/scraper.ex similarity index 94% rename from lib/google_crawler/google/scrapper.ex rename to lib/google_crawler/google/scraper.ex index 61148d3..a8ad2cc 100644 --- a/lib/google_crawler/google/scrapper.ex +++ 
b/lib/google_crawler/google/scraper.ex @@ -1,5 +1,5 @@ -defmodule GoogleCrawler.Google.Scrapper do - alias GoogleCrawler.Google.ScrapperResult +defmodule GoogleCrawler.Google.Scraper do + alias GoogleCrawler.Google.ScraperResult @selectors %{ total_results: "#result-stats", @@ -9,7 +9,7 @@ defmodule GoogleCrawler.Google.Scrapper do } def scrap(html) do - result = %ScrapperResult{} + result = %ScraperResult{} {:ok, document} = Floki.parse_document(html) diff --git a/lib/google_crawler/google/scrapper_result.ex b/lib/google_crawler/google/scraper_result.ex similarity index 84% rename from lib/google_crawler/google/scrapper_result.ex rename to lib/google_crawler/google/scraper_result.ex index 9354ecb..652dfb5 100644 --- a/lib/google_crawler/google/scrapper_result.ex +++ b/lib/google_crawler/google/scraper_result.ex @@ -1,4 +1,4 @@ -defmodule GoogleCrawler.Google.ScrapperResult do +defmodule GoogleCrawler.Google.ScraperResult do defstruct raw_html_result: nil, total_results: 0, links: [], diff --git a/lib/google_crawler/search.ex b/lib/google_crawler/search.ex index cb8e1e1..ad45314 100644 --- a/lib/google_crawler/search.ex +++ b/lib/google_crawler/search.ex @@ -10,7 +10,7 @@ defmodule GoogleCrawler.Search do alias GoogleCrawler.Search.Keyword alias GoogleCrawler.Search.KeywordFile alias GoogleCrawler.Search.Link - alias GoogleCrawler.Google.ScrapperResult + alias GoogleCrawler.Google.ScraperResult @doc """ Returns the list of keywords belongs to the given user. @@ -124,9 +124,9 @@ defmodule GoogleCrawler.Search do end @doc """ - Update the keyword result from the scrapper result and mark the keyword as completed. + Update the keyword result from the scraper result and mark the keyword as completed. """ - def update_keyword_result_from_scrapper(%Keyword{} = keyword, %ScrapperResult{} = result) do + def update_keyword_result_from_scraper(%Keyword{} = keyword, %ScraperResult{} = result) do keyword_changeset = Keyword.update_result_changeset(keyword, %{ status: :completed, diff --git a/lib/google_crawler/search/search_keyword_task.ex b/lib/google_crawler/search/search_keyword_task.ex index bd14ffa..ab820b7 100644 --- a/lib/google_crawler/search/search_keyword_task.ex +++ b/lib/google_crawler/search/search_keyword_task.ex @@ -1,11 +1,11 @@ defmodule GoogleCrawler.Search.SearchKeywordTask do alias GoogleCrawler.Search.Keyword - alias GoogleCrawler.Google.Scrapper + alias GoogleCrawler.Google.Scraper def perform(%Keyword{} = keyword) do case google_api_client().search(keyword.keyword) do {:ok, body} -> - Scrapper.scrap(body) + Scraper.scrap(body) {:error, reason} -> raise "Keyword search failed: #{reason}" diff --git a/lib/google_crawler/search/search_keyword_worker.ex b/lib/google_crawler/search/search_keyword_worker.ex index 912cedc..8bdf9fe 100644 --- a/lib/google_crawler/search/search_keyword_worker.ex +++ b/lib/google_crawler/search/search_keyword_worker.ex @@ -51,7 +51,7 @@ defmodule GoogleCrawler.SearchKeywordWorker do {keyword, retry_count} = Map.get(state, ref) new_state = - case Search.update_keyword_result_from_scrapper(keyword, result) do + case Search.update_keyword_result_from_scraper(keyword, result) do {:ok, _result} -> # Demonitor the task and remove from the state Process.demonitor(ref, [:flush]) diff --git a/lib/google_crawler_web/controllers/upload_controller.ex b/lib/google_crawler_web/controllers/upload_controller.ex index d1a6fb2..cba72e1 100644 --- a/lib/google_crawler_web/controllers/upload_controller.ex +++ b/lib/google_crawler_web/controllers/upload_controller.ex @@ -22,7 
+22,7 @@ defmodule GoogleCrawlerWeb.UploadController do end end - # TODO: Trigger the scrapper background worker + # TODO: Trigger the scraper background worker defp create_and_trigger_google_search(csv_result, conn) do csv_result |> Stream.map(fn keyword_row -> List.first(keyword_row) end) diff --git a/test/factories/scrapper_result_factory.ex b/test/factories/scraper_result_factory.ex similarity index 95% rename from test/factories/scrapper_result_factory.ex rename to test/factories/scraper_result_factory.ex index 45c7383..c28bee6 100644 --- a/test/factories/scrapper_result_factory.ex +++ b/test/factories/scraper_result_factory.ex @@ -1,4 +1,4 @@ -defmodule GoogleCrawler.ScrapperResultFactory do +defmodule GoogleCrawler.ScraperResultFactory do def default_attrs(total_links, total_top_ads_links, total_bottom_ads_links) do total_links = total_links || Enum.random(0..10) total_top_ads_links = total_top_ads_links || Enum.random(0..5) diff --git a/test/google_crawler/google/scrapper_test.exs b/test/google_crawler/google/scraper_test.exs similarity index 87% rename from test/google_crawler/google/scrapper_test.exs rename to test/google_crawler/google/scraper_test.exs index 02816c5..c9ff989 100644 --- a/test/google_crawler/google/scrapper_test.exs +++ b/test/google_crawler/google/scraper_test.exs @@ -1,16 +1,16 @@ -defmodule GoogleCrawler.Google.ScrapperTest do +defmodule GoogleCrawler.Google.ScraperTest do use ExUnit.Case - alias GoogleCrawler.Google.Scrapper - alias GoogleCrawler.Google.ScrapperResult + alias GoogleCrawler.Google.Scraper + alias GoogleCrawler.Google.ScraperResult test "scrap/1" do html = response_fixtures("hotels.html") - result = Scrapper.scrap(html) + result = Scraper.scrap(html) raw_html = cleanup_html(html) - assert %ScrapperResult{ + assert %ScraperResult{ raw_html_result: ^raw_html, total_results: 5_970_000_000, links: [ diff --git a/test/google_crawler/search_test.exs b/test/google_crawler/search_test.exs index 9151d12..cc337ff 100644 --- a/test/google_crawler/search_test.exs +++ b/test/google_crawler/search_test.exs @@ -3,9 +3,9 @@ defmodule GoogleCrawler.SearchTest do alias GoogleCrawler.Search alias GoogleCrawler.Search.Keyword - alias GoogleCrawler.Google.ScrapperResult + alias GoogleCrawler.Google.ScraperResult alias GoogleCrawler.KeywordFactory - alias GoogleCrawler.ScrapperResultFactory + alias GoogleCrawler.ScraperResultFactory alias GoogleCrawler.UserFactory describe "keywords" do @@ -75,13 +75,13 @@ defmodule GoogleCrawler.SearchTest do assert Repo.get_by(Keyword, keyword: keyword.keyword) != nil end - test "update_keyword_result_from_scrapper/2 updates the keyword results and associates the keyword links" do + test "update_keyword_result_from_scraper/2 updates the keyword results and associates the keyword links" do keyword = KeywordFactory.create() - scrapper_result = + scraper_result = struct( - ScrapperResult, - ScrapperResultFactory.build_attrs( + ScraperResult, + ScraperResultFactory.build_attrs( total_results: 50_000, total_links: 10, total_top_ads_links: 3, @@ -89,7 +89,7 @@ defmodule GoogleCrawler.SearchTest do ) ) - Search.update_keyword_result_from_scrapper(keyword, scrapper_result) + Search.update_keyword_result_from_scraper(keyword, scraper_result) # Keyword result summary is updated assert %{ @@ -102,19 +102,19 @@ defmodule GoogleCrawler.SearchTest do top_ads_query = [is_ads: true, ads_position: :top] top_ads_links = get_link_urls(Search.list_keyword_links(keyword, top_ads_query)) assert 3 = length(top_ads_links) - assert 
scrapper_result.top_ads_links == top_ads_links + assert scraper_result.top_ads_links == top_ads_links # Bottom Ads links is inserted bottom_ads_query = [is_ads: true, ads_position: :bottom] bottom_ads_links = get_link_urls(Search.list_keyword_links(keyword, bottom_ads_query)) assert 1 = length(bottom_ads_links) - assert scrapper_result.bottom_ads_links == bottom_ads_links + assert scraper_result.bottom_ads_links == bottom_ads_links # Non-Ads links is inserted non_ads_query = [is_ads: false] links = get_link_urls(Search.list_keyword_links(keyword, non_ads_query)) assert 10 = length(links) - assert scrapper_result.links == links + assert scraper_result.links == links end end From 39f55b935192a0b90b98a24217b6ee0694fa7260 Mon Sep 17 00:00:00 2001 From: Rossukhon Leagmongkol Date: Thu, 16 Apr 2020 13:53:07 +0700 Subject: [PATCH 17/17] Remove TODOs --- lib/google_crawler/search.ex | 1 - lib/google_crawler/search/link.ex | 6 ++---- .../controllers/registration_controller.ex | 1 - lib/google_crawler_web/controllers/upload_controller.ex | 1 - test/google_crawler/search/link_test.exs | 2 +- 5 files changed, 3 insertions(+), 8 deletions(-) diff --git a/lib/google_crawler/search.ex b/lib/google_crawler/search.ex index ad45314..27e5c1f 100644 --- a/lib/google_crawler/search.ex +++ b/lib/google_crawler/search.ex @@ -178,7 +178,6 @@ defmodule GoogleCrawler.Search do |> Repo.all() end - # TODO: Recheck how to document private functions # Create the multi to insert the links. # Other attributes of the link except the link itself must be specified defp create_keyword_link_multi(multi, _keyword, [], _attrs), do: multi diff --git a/lib/google_crawler/search/link.ex b/lib/google_crawler/search/link.ex index dcc6e3c..5ea4924 100644 --- a/lib/google_crawler/search/link.ex +++ b/lib/google_crawler/search/link.ex @@ -26,10 +26,8 @@ defmodule GoogleCrawler.Search.Link do end def validate_ads_position(%Ecto.Changeset{changes: %{is_ads: true}} = changeset) do - case get_field(changeset, :ads_position) do - nil -> add_error(changeset, :ads_position, "can't be blank") - _ -> changeset - end + changeset + |> validate_required(:ads_position) end def validate_ads_position(changeset), do: changeset diff --git a/lib/google_crawler_web/controllers/registration_controller.ex b/lib/google_crawler_web/controllers/registration_controller.ex index e7f8d2e..cd33b0a 100644 --- a/lib/google_crawler_web/controllers/registration_controller.ex +++ b/lib/google_crawler_web/controllers/registration_controller.ex @@ -15,7 +15,6 @@ defmodule GoogleCrawlerWeb.RegistrationController do {:ok, _user} -> conn |> put_flash(:info, gettext("You have signed up successfully!")) - # TODO: Change to login path |> redirect(to: Routes.dashboard_path(conn, :index)) {:error, changeset} -> diff --git a/lib/google_crawler_web/controllers/upload_controller.ex b/lib/google_crawler_web/controllers/upload_controller.ex index cba72e1..38256cb 100644 --- a/lib/google_crawler_web/controllers/upload_controller.ex +++ b/lib/google_crawler_web/controllers/upload_controller.ex @@ -22,7 +22,6 @@ defmodule GoogleCrawlerWeb.UploadController do end end - # TODO: Trigger the scraper background worker defp create_and_trigger_google_search(csv_result, conn) do csv_result |> Stream.map(fn keyword_row -> List.first(keyword_row) end) diff --git a/test/google_crawler/search/link_test.exs b/test/google_crawler/search/link_test.exs index 19a5920..aa4458e 100644 --- a/test/google_crawler/search/link_test.exs +++ b/test/google_crawler/search/link_test.exs @@ -35,7 +35,7 @@ defmodule 
Googlecrawler.Search.LinkTest do refute changeset.valid? # TODO: Recheck the validations - assert %{ads_position: ["can't be blank", "is invalid"]} = errors_on(changeset) + assert %{ads_position: ["is invalid"]} = errors_on(changeset) end end end
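Note: after this final cleanup, validate_ads_position/1 reduces to validate_required/2 scoped to ads rows, matching the updated test. A sketch of the resulting changeset behavior (the attrs and URL are illustrative):

alias GoogleCrawler.Search.Link

# An ads link without a position is rejected by validate_required(:ads_position)
Link.changeset(%Link{}, %{url: "https://example.com", is_ads: true})
# => errors include {:ads_position, {"can't be blank", [validation: :required]}}

# An organic link needs no ads_position
Link.changeset(%Link{}, %{url: "https://example.com", is_ads: false}).valid?
# => true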