From 5cf59c6fe915f4723cdc640e4c42d34345fdd19b Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Tue, 9 Jul 2024 20:31:06 +0300 Subject: [PATCH 01/15] Add data migration for creating and syncing location_data table and dictionary --- lib/plausible/clickhouse_location_data.ex | 13 ++ .../data_migration/locations_sync.ex | 122 ++++++++++++++++++ .../populate_event_session_columns.ex | 19 +-- lib/plausible/migration_utils.ex | 13 ++ mix.exs | 2 +- mix.lock | 2 +- .../sql/add-alias-column.sql.eex | 5 + .../sql/create-location-data-table.sql.eex | 13 ++ .../get-location-data-table-comment.sql.eex | 1 + .../sql/truncate-location-data-table.sql.eex | 1 + .../update-location-data-dictionary.sql.eex | 11 ++ ...update-location-data-table-comment.sql.eex | 3 + 12 files changed, 189 insertions(+), 16 deletions(-) create mode 100644 lib/plausible/clickhouse_location_data.ex create mode 100644 lib/plausible/data_migration/locations_sync.ex create mode 100644 priv/data_migrations/LocationsSync/sql/add-alias-column.sql.eex create mode 100644 priv/data_migrations/LocationsSync/sql/create-location-data-table.sql.eex create mode 100644 priv/data_migrations/LocationsSync/sql/get-location-data-table-comment.sql.eex create mode 100644 priv/data_migrations/LocationsSync/sql/truncate-location-data-table.sql.eex create mode 100644 priv/data_migrations/LocationsSync/sql/update-location-data-dictionary.sql.eex create mode 100644 priv/data_migrations/LocationsSync/sql/update-location-data-table-comment.sql.eex diff --git a/lib/plausible/clickhouse_location_data.ex b/lib/plausible/clickhouse_location_data.ex new file mode 100644 index 000000000000..453eafa21afa --- /dev/null +++ b/lib/plausible/clickhouse_location_data.ex @@ -0,0 +1,13 @@ +defmodule Plausible.ClickhouseLocationData do + @moduledoc """ + Schema for storing location id <-> translation mappins in Clickhouse + """ + use Ecto.Schema + + @primary_key false + schema "location_data" do + field :type, Ch, type: 
"LowCardinality(String)" + field :id, :string + field :name, :string + end +end diff --git a/lib/plausible/data_migration/locations_sync.ex b/lib/plausible/data_migration/locations_sync.ex new file mode 100644 index 000000000000..a4f9febb762f --- /dev/null +++ b/lib/plausible/data_migration/locations_sync.ex @@ -0,0 +1,122 @@ +defmodule Plausible.DataMigration.LocationsSync do + @moduledoc """ + ClickHouse locations data migration for storing locations in clickhouse. + + Run regularly as plausible/locations data changes. + + SQL files available at: priv/data_migrations/LocationsSync/sql + """ + alias Plausible.ClickhouseLocationData + + use Plausible.DataMigration, dir: "LocationsSync", repo: Plausible.IngestRepo + + @columns [ + %{ + table: "events_v2", + column_name: "country_name", + type: "country", + input_column: "country_code" + }, + %{ + table: "events_v2", + column_name: "region_name", + type: "subdivision", + input_column: "subdivision1_code" + }, + %{ + table: "events_v2", + column_name: "city_name", + type: "city", + input_column: "city_geoname_id" + }, + %{ + table: "sessions_v2", + column_name: "country_name", + type: "country", + input_column: "country_code" + }, + %{ + table: "sessions_v2", + column_name: "region_name", + type: "subdivision", + input_column: "subdivision1_code" + }, + %{ + table: "sessions_v2", + column_name: "city_name", + type: "city", + input_column: "city_geoname_id" + }, + %{ + table: "imported_locations", + column_name: "country_name", + type: "country", + input_column: "country" + }, + %{ + table: "imported_locations", + column_name: "region_name", + type: "subdivision", + input_column: "region" + }, + %{ + table: "imported_locations", + column_name: "city_name", + type: "city", + input_column: "city" + } + ] + + def out_of_date?() do + case run_sql("get-location-data-table-comment") do + {:ok, %{rows: [[stored_version]]}} -> stored_version != Location.version() + _ -> true + end + end + + def run() do + cluster? 
= Plausible.MigrationUtils.clustered_table?("sessions_v2") + + {:ok, _} = run_sql("truncate-location-data-table", cluster?: cluster?) + {:ok, _} = run_sql("create-location-data-table", cluster?: cluster?) + + countries = + Location.Country.all() + |> Enum.map(fn country -> %{type: "country", id: country.alpha_2, name: country.name} end) + + subdivisions = + Location.Subdivision.all() + |> Enum.map(fn subdivision -> + %{type: "subdivision", id: subdivision.code, name: subdivision.name} + end) + + cities = + Location.City.all() + |> Enum.map(fn city -> %{type: "city", id: Integer.to_string(city.id), name: city.name} end) + + @repo.insert_all(ClickhouseLocationData, Enum.concat([countries, subdivisions, cities])) + + {:ok, _} = + run_sql("update-location-data-dictionary", + cluster?: cluster?, + dictionary_connection_params: Plausible.MigrationUtils.dictionary_connection_params() + ) + + for column <- @columns do + {:ok, _} = + run_sql("add-alias-column", + cluster?: cluster?, + table: column.table, + column_name: column.column_name, + type: column.type, + input_column: column.input_column + ) + end + + {:ok, _} = + run_sql("update-location-data-table-comment", + cluster?: cluster?, + version: Location.version() + ) + end +end diff --git a/lib/plausible/data_migration/populate_event_session_columns.ex b/lib/plausible/data_migration/populate_event_session_columns.ex index 659ca8c83186..8ae2f9008eb9 100644 --- a/lib/plausible/data_migration/populate_event_session_columns.ex +++ b/lib/plausible/data_migration/populate_event_session_columns.ex @@ -28,7 +28,11 @@ defmodule Plausible.DataMigration.PopulateEventSessionColumns do run_sql("create-sessions-dictionary", cluster?: cluster?, dictionary_connection_params: - Keyword.get(opts, :dictionary_connection_string, dictionary_connection_params()), + Keyword.get( + opts, + :dictionary_connection_string, + Plausible.MigrationUtils.dictionary_connection_params() + ), dictionary_config: dictionary_config(opts) ) @@ -136,19 
+140,6 @@ defmodule Plausible.DataMigration.PopulateEventSessionColumns do |> Map.merge(Keyword.get(opts, :dictionary_config, %{})) end - # See https://clickhouse.com/docs/en/sql-reference/dictionaries#clickhouse for context - defp dictionary_connection_params() do - Plausible.IngestRepo.config() - |> Enum.map(fn - {:database, database} -> "DB '#{database}'" - {:username, username} -> "USER '#{username}'" - {:password, password} -> "PASSWORD '#{password}'" - _ -> nil - end) - |> Enum.reject(&is_nil/1) - |> Enum.join(" ") - end - defp get_partitions(opts) do [min_partition, max_partition] = Keyword.get(opts, :partition_range, ["0", "999999"]) diff --git a/lib/plausible/migration_utils.ex b/lib/plausible/migration_utils.ex index ac416a389a4e..a5f4eba6bf31 100644 --- a/lib/plausible/migration_utils.ex +++ b/lib/plausible/migration_utils.ex @@ -13,4 +13,17 @@ defmodule Plausible.MigrationUtils do {:ok, _} -> true end end + + # See https://clickhouse.com/docs/en/sql-reference/dictionaries#clickhouse for context + def dictionary_connection_params() do + Plausible.IngestRepo.config() + |> Enum.map(fn + {:database, database} -> "DB '#{database}'" + {:username, username} -> "USER '#{username}'" + {:password, password} -> "PASSWORD '#{password}'" + _ -> nil + end) + |> Enum.reject(&is_nil/1) + |> Enum.join(" ") + end end diff --git a/mix.exs b/mix.exs index 19b023a42717..ea953b31a9d1 100644 --- a/mix.exs +++ b/mix.exs @@ -93,7 +93,7 @@ defmodule Plausible.MixProject do {:hackney, "~> 1.8"}, {:jason, "~> 1.3"}, {:kaffy, "~> 0.10.2", only: [:dev, :test, :staging, :prod]}, - {:location, git: "https://github.com/plausible/location.git"}, + {:location, git: "https://github.com/plausible/location.git", branch: "all-all"}, {:mox, "~> 1.0", only: [:test, :ce_test]}, {:nanoid, "~> 2.1.0"}, {:nimble_totp, "~> 1.0"}, diff --git a/mix.lock b/mix.lock index 9474037a6057..c5a48d88c5dd 100644 --- a/mix.lock +++ b/mix.lock @@ -73,7 +73,7 @@ "joken": {:hex, :joken, "2.6.0", 
"b9dd9b6d52e3e6fcb6c65e151ad38bf4bc286382b5b6f97079c47ade6b1bcc6a", [:mix], [{:jose, "~> 1.11.5", [hex: :jose, repo: "hexpm", optional: false]}], "hexpm", "5a95b05a71cd0b54abd35378aeb1d487a23a52c324fa7efdffc512b655b5aaa7"}, "jose": {:hex, :jose, "1.11.6", "613fda82552128aa6fb804682e3a616f4bc15565a048dabd05b1ebd5827ed965", [:mix, :rebar3], [], "hexpm", "6275cb75504f9c1e60eeacb771adfeee4905a9e182103aa59b53fed651ff9738"}, "kaffy": {:hex, :kaffy, "0.10.2", "72e807c525323bd0cbc3ac0c127b7bde61caffdc576fb6554964d3fe6a2a6100", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix, "~> 1.6", [hex: :phoenix, repo: "hexpm", optional: false]}, {:phoenix_html, "~> 3.0", [hex: :phoenix_html, repo: "hexpm", optional: false]}, {:phoenix_view, "~> 2.0.2", [hex: :phoenix_view, repo: "hexpm", optional: false]}], "hexpm", "651cad5f3bcc91510a671c13c7a273b8b8195fdf2d809208708baecbb77300bf"}, - "location": {:git, "https://github.com/plausible/location.git", "3f360af0c9deac1d2ca0bd1c4fcb8769b673d948", []}, + "location": {:git, "https://github.com/plausible/location.git", "5eeaa752d8f68236ffe637446a8d7aabb05c900f", [branch: "all-all"]}, "locus": {:hex, :locus, "2.3.6", "c9f53fd5df872fca66a54dc0aa2f8b2d3640388e56a0c39a741be0df6d8854bf", [:rebar3], [{:tls_certificate_check, "~> 1.9", [hex: :tls_certificate_check, repo: "hexpm", optional: false]}], "hexpm", "6087aa9a69673e7011837fb4b3d7f756560adde76892c32f5f93904ee30064e2"}, "mail": {:hex, :mail, "0.3.1", "cb0a14e4ed8904e4e5a08214e686ccf6f9099346885db17d8c309381f865cc5c", [:mix], [], "hexpm", "1db701e89865c1d5fa296b2b57b1cd587587cca8d8a1a22892b35ef5a8e352a6"}, "makeup": {:hex, :makeup, "1.1.1", "fa0bc768698053b2b3869fa8a62616501ff9d11a562f3ce39580d60860c3a55e", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5dc62fbdd0de44de194898b6710692490be74baa02d9d108bc29f007783b0b48"}, diff --git 
a/priv/data_migrations/LocationsSync/sql/add-alias-column.sql.eex b/priv/data_migrations/LocationsSync/sql/add-alias-column.sql.eex new file mode 100644 index 000000000000..36c77dab8f30 --- /dev/null +++ b/priv/data_migrations/LocationsSync/sql/add-alias-column.sql.eex @@ -0,0 +1,5 @@ +ALTER TABLE <%= @table %> +<%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> +ADD COLUMN IF NOT EXISTS +<%= @column_name %> String +ALIAS dictGet('location_data_dict', 'name', tuple('<%= @type %>', <%= @input_column %>)) diff --git a/priv/data_migrations/LocationsSync/sql/create-location-data-table.sql.eex b/priv/data_migrations/LocationsSync/sql/create-location-data-table.sql.eex new file mode 100644 index 000000000000..db2f3ebb91fb --- /dev/null +++ b/priv/data_migrations/LocationsSync/sql/create-location-data-table.sql.eex @@ -0,0 +1,13 @@ +CREATE TABLE IF NOT EXISTS location_data <%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> +( + `type` LowCardinality(String), + `id` String, + `name` String +) +<%= if @cluster? 
do %> +ENGINE = ReplicatedMergeTree('/clickhouse/{cluster}/tables/{shard}/plausible_prod/location_data', '{replica}') +<% else %> +ENGINE = MergeTree() +<% end %> +ORDER BY (type, id) +SETTINGS index_granularity = 128 diff --git a/priv/data_migrations/LocationsSync/sql/get-location-data-table-comment.sql.eex b/priv/data_migrations/LocationsSync/sql/get-location-data-table-comment.sql.eex new file mode 100644 index 000000000000..8d8acc1e4aea --- /dev/null +++ b/priv/data_migrations/LocationsSync/sql/get-location-data-table-comment.sql.eex @@ -0,0 +1 @@ +select comment from system.tables where database = currentDatabase() and table = 'location_data' diff --git a/priv/data_migrations/LocationsSync/sql/truncate-location-data-table.sql.eex b/priv/data_migrations/LocationsSync/sql/truncate-location-data-table.sql.eex new file mode 100644 index 000000000000..852d56c11f3c --- /dev/null +++ b/priv/data_migrations/LocationsSync/sql/truncate-location-data-table.sql.eex @@ -0,0 +1 @@ +TRUNCATE TABLE IF EXISTS location_data <%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> diff --git a/priv/data_migrations/LocationsSync/sql/update-location-data-dictionary.sql.eex b/priv/data_migrations/LocationsSync/sql/update-location-data-dictionary.sql.eex new file mode 100644 index 000000000000..1b24d16d3000 --- /dev/null +++ b/priv/data_migrations/LocationsSync/sql/update-location-data-dictionary.sql.eex @@ -0,0 +1,11 @@ +CREATE OR REPLACE DICTIONARY location_data_dict +<%= if @cluster? 
do %>ON CLUSTER '{cluster}'<% end %> +( + `type` String, + `id` String, + `name` String +) +PRIMARY KEY type, id +SOURCE(CLICKHOUSE(TABLE location_data <%= @dictionary_connection_params %>)) +LIFETIME(0) +LAYOUT(complex_key_cache(size_in_cells 500000)) diff --git a/priv/data_migrations/LocationsSync/sql/update-location-data-table-comment.sql.eex b/priv/data_migrations/LocationsSync/sql/update-location-data-table-comment.sql.eex new file mode 100644 index 000000000000..94b3c081afc6 --- /dev/null +++ b/priv/data_migrations/LocationsSync/sql/update-location-data-table-comment.sql.eex @@ -0,0 +1,3 @@ +ALTER TABLE location_data +<%= if @cluster? do %>ON CLUSTER '{cluster}'<% end %> +MODIFY COMMENT '<%= @version %>' From 17b93c0f3e1a6353be24475e40fea704aaf24e9d Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Tue, 9 Jul 2024 21:16:10 +0300 Subject: [PATCH 02/15] Migration to populate location data --- .../20240709181437_populate_location_data.exs | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 priv/ingest_repo/migrations/20240709181437_populate_location_data.exs diff --git a/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs new file mode 100644 index 000000000000..9610ddba6d55 --- /dev/null +++ b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs @@ -0,0 +1,11 @@ +defmodule Plausible.IngestRepo.Migrations.PopulateLocationData do + use Ecto.Migration + + def up do + Plausible.DataMigration.LocationsSync.run() + end + + def down do + raise "Irreversible" + end +end From f64d4992df62f079a1ad5a26c06e31c6819558e4 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Tue, 9 Jul 2024 21:17:01 +0300 Subject: [PATCH 03/15] Daily cron to refresh location dataset if changed --- config/runtime.exs | 4 +++- lib/mix/tasks/clean_clickhouse.ex | 2 +- lib/plausible/data_migration/locations_sync.ex | 2 +- lib/workers/locations_sync.ex | 13 
+++++++++++++ .../20240709181437_populate_location_data.exs | 6 ++++++ 5 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 lib/workers/locations_sync.ex diff --git a/config/runtime.exs b/config/runtime.exs index 170f39933d45..17008f93a282 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -532,7 +532,9 @@ base_cron = [ # Every day at 1am {"0 1 * * *", Plausible.Workers.CleanInvitations}, # Every 2 hours - {"0 */2 * * *", Plausible.Workers.ExpireDomainChangeTransitions} + {"0 */2 * * *", Plausible.Workers.ExpireDomainChangeTransitions}, + # Daily at midnight + {"0 0 * * *", Plausible.Workers.LocationsSync} ] cloud_cron = [ diff --git a/lib/mix/tasks/clean_clickhouse.ex b/lib/mix/tasks/clean_clickhouse.ex index 71b984b49200..16731c976466 100644 --- a/lib/mix/tasks/clean_clickhouse.ex +++ b/lib/mix/tasks/clean_clickhouse.ex @@ -6,7 +6,7 @@ defmodule Mix.Tasks.CleanClickhouse do def run(_) do %{rows: rows} = IngestRepo.query!("show tables") tables = Enum.map(rows, fn [table] -> table end) - to_truncate = tables -- ["schema_migrations"] + to_truncate = tables -- ["schema_migrations", "location_data", "location_data_dict"] Enum.each(to_truncate, fn table -> IngestRepo.query!("truncate #{table}") diff --git a/lib/plausible/data_migration/locations_sync.ex b/lib/plausible/data_migration/locations_sync.ex index a4f9febb762f..5790f4e5d261 100644 --- a/lib/plausible/data_migration/locations_sync.ex +++ b/lib/plausible/data_migration/locations_sync.ex @@ -1,6 +1,6 @@ defmodule Plausible.DataMigration.LocationsSync do @moduledoc """ - ClickHouse locations data migration for storing locations in clickhouse. + ClickHouse locations data migration for storing location names in clickhouse. Run regularly as plausible/locations data changes. 
diff --git a/lib/workers/locations_sync.ex b/lib/workers/locations_sync.ex new file mode 100644 index 000000000000..fe93adb9d997 --- /dev/null +++ b/lib/workers/locations_sync.ex @@ -0,0 +1,13 @@ +defmodule Plausible.Workers.LocationsSync do + use Plausible.Repo + use Oban.Worker, queue: :update_locations + + @impl Oban.Worker + def perform(_job) do + if Plausible.DataMigration.LocationsSync.out_of_date?() do + Plausible.DataMigration.LocationsSync.run() + end + + :ok + end +end diff --git a/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs index 9610ddba6d55..edcc58f25e5f 100644 --- a/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs +++ b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs @@ -2,6 +2,12 @@ defmodule Plausible.IngestRepo.Migrations.PopulateLocationData do use Ecto.Migration def up do + try do + Location.load_all() + rescue + _ -> nil + end + Plausible.DataMigration.LocationsSync.run() end From 6aed88656a9997b2874d9198992c38102fda1672 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Tue, 9 Jul 2024 22:17:17 +0300 Subject: [PATCH 04/15] Add support for visit:country_name, visit:region_name and visit:city_name dimensions Under the hood this relies on a `location_data` table in clickhouse being regularly synced with plausible/location repo and dictionary lookups used in ALIAS columns --- lib/plausible/stats/filters/filters.ex | 3 ++ lib/plausible/stats/imported/base.ex | 3 ++ lib/plausible/stats/imported/sql/builder.ex | 4 ++ lib/plausible/stats/sql/expression.ex | 9 ++++ .../query_imported_test.exs | 53 ++++++++++++++++++- 5 files changed, 71 insertions(+), 1 deletion(-) diff --git a/lib/plausible/stats/filters/filters.ex b/lib/plausible/stats/filters/filters.ex index 60c1575fc8db..5195dd3a4e38 100644 --- a/lib/plausible/stats/filters/filters.ex +++ b/lib/plausible/stats/filters/filters.ex @@ -23,6 +23,9 @@ 
defmodule Plausible.Stats.Filters do :country, :region, :city, + :country_name, + :region_name, + :city_name, :entry_page, :exit_page, :entry_page_hostname, diff --git a/lib/plausible/stats/imported/base.ex b/lib/plausible/stats/imported/base.ex index 11909b234bf2..a7c0b144dcc2 100644 --- a/lib/plausible/stats/imported/base.ex +++ b/lib/plausible/stats/imported/base.ex @@ -21,6 +21,9 @@ defmodule Plausible.Stats.Imported.Base do "visit:country" => "imported_locations", "visit:region" => "imported_locations", "visit:city" => "imported_locations", + "visit:country_name" => "imported_locations", + "visit:region_name" => "imported_locations", + "visit:city_name" => "imported_locations", "visit:device" => "imported_devices", "visit:browser" => "imported_browsers", "visit:browser_version" => "imported_browsers", diff --git a/lib/plausible/stats/imported/sql/builder.ex b/lib/plausible/stats/imported/sql/builder.ex index cf81959d3fd4..eff213029db4 100644 --- a/lib/plausible/stats/imported/sql/builder.ex +++ b/lib/plausible/stats/imported/sql/builder.ex @@ -318,6 +318,10 @@ defmodule Plausible.Stats.Imported.SQL.Builder do defp filter_group_values(q, "visit:region"), do: where(q, [i], i.region != "") defp filter_group_values(q, "visit:city"), do: where(q, [i], i.city != 0 and not is_nil(i.city)) + defp filter_group_values(q, "visit:country_name"), do: where(q, [i], i.country_name != "") + defp filter_group_values(q, "visit:region_name"), do: where(q, [i], i.region_name != "") + defp filter_group_values(q, "visit:city_name"), do: where(q, [i], i.city_name != "") + defp filter_group_values(q, _dimension), do: q def select_joined_dimensions(q, query) do diff --git a/lib/plausible/stats/sql/expression.ex b/lib/plausible/stats/sql/expression.ex index b3b8a8dab1b6..0ce6274f9db1 100644 --- a/lib/plausible/stats/sql/expression.ex +++ b/lib/plausible/stats/sql/expression.ex @@ -179,6 +179,15 @@ defmodule Plausible.Stats.SQL.Expression do def dimension(key, "visit:city", _table, 
_query), do: wrap_alias([t], %{key => t.city}) + def dimension(key, "visit:country_name", _table, _query), + do: wrap_alias([t], %{key => t.country_name}) + + def dimension(key, "visit:region_name", _table, _query), + do: wrap_alias([t], %{key => t.region_name}) + + def dimension(key, "visit:city_name", _table, _query), + do: wrap_alias([t], %{key => t.city_name}) + def event_metric(:pageviews) do wrap_alias([e], %{ pageviews: diff --git a/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs b/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs index 2e395a6acd4c..82f87a9cb67c 100644 --- a/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs +++ b/test/plausible_web/controllers/api/external_stats_controller/query_imported_test.exs @@ -674,7 +674,10 @@ defmodule PlausibleWeb.Api.ExternalStatsController.QueryImportedTest do for {dimension, stats_value, imports_value} <- [ {"visit:country", "DE", "EE"}, {"visit:region", "DE-BE", "EE-37"}, - {"visit:city", 2_950_159, 588_409} + {"visit:city", 2_950_159, 588_409}, + {"visit:country_name", "Germany", "Estonia"}, + {"visit:region_name", "Berlin", "Harjumaa"}, + {"visit:city_name", "Berlin", "Tallinn"} ] do conn = post(conn, "/api/v2/query", %{ @@ -691,5 +694,53 @@ defmodule PlausibleWeb.Api.ExternalStatsController.QueryImportedTest do ] end end + + test "imported country and city names", %{ + site: site, + conn: conn + } do + site_import = insert(:site_import, site: site) + + populate_stats(site, site_import.id, [ + build(:pageview, + country_code: "GB", + # London + city_geoname_id: 2_643_743 + ), + build(:pageview, + country_code: "CA", + # Different London + city_geoname_id: 6_058_560 + ), + build(:imported_locations, country: "GB", city: 2_643_743, visitors: 33) + ]) + + conn = + post(conn, "/api/v2/query", %{ + "site_id" => site.domain, + "metrics" => ["visitors"], + "date_range" => "all", + "dimensions" => 
["visit:city_name"], + "include" => %{"imports" => true} + }) + + assert json_response(conn, 200)["results"] == [ + %{"dimensions" => ["London"], "metrics" => [35]} + ] + + conn = + post(conn, "/api/v2/query", %{ + "site_id" => site.domain, + "metrics" => ["visitors"], + "date_range" => "all", + "dimensions" => ["visit:city_name", "visit:country_name"], + "include" => %{"imports" => true} + }) + + assert json_response(conn, 200)["results"] == [ + %{"dimensions" => ["London", "United Kingdom"], "metrics" => [34]}, + %{"dimensions" => ["London", "Canada"], "metrics" => [1]} + ] + end end end From 981577ff6e0750ab191937b0e38cd5ba94e04924 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Tue, 9 Jul 2024 22:54:51 +0300 Subject: [PATCH 05/15] Update queue name --- lib/plausible/data_migration/locations_sync.ex | 3 ++- lib/workers/locations_sync.ex | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/plausible/data_migration/locations_sync.ex b/lib/plausible/data_migration/locations_sync.ex index 5790f4e5d261..806737ede142 100644 --- a/lib/plausible/data_migration/locations_sync.ex +++ b/lib/plausible/data_migration/locations_sync.ex @@ -94,7 +94,8 @@ defmodule Plausible.DataMigration.LocationsSync do Location.City.all() |> Enum.map(fn city -> %{type: "city", id: Integer.to_string(city.id), name: city.name} end) - @repo.insert_all(ClickhouseLocationData, Enum.concat([countries, subdivisions, cities])) + insert_data = Enum.concat([countries, subdivisions, cities]) + @repo.insert_all(ClickhouseLocationData, insert_data) {:ok, _} = run_sql("update-location-data-dictionary", diff --git a/lib/workers/locations_sync.ex b/lib/workers/locations_sync.ex index fe93adb9d997..a261af32613a 100644 --- a/lib/workers/locations_sync.ex +++ b/lib/workers/locations_sync.ex @@ -1,6 +1,8 @@ defmodule Plausible.Workers.LocationsSync do + @moduledoc false + use Plausible.Repo - use Oban.Worker, queue: :update_locations + use Oban.Worker, queue: :locations_sync @impl 
Oban.Worker def perform(_job) do From 4e34c6c1402983461a135fd29ff6d674bc56066f Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 10 Jul 2024 10:29:00 +0300 Subject: [PATCH 06/15] Update documentation --- lib/plausible/clickhouse_location_data.ex | 5 ++++- lib/plausible/data_migration/locations_sync.ex | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/lib/plausible/clickhouse_location_data.ex b/lib/plausible/clickhouse_location_data.ex index 453eafa21afa..cb936d2b5f0c 100644 --- a/lib/plausible/clickhouse_location_data.ex +++ b/lib/plausible/clickhouse_location_data.ex @@ -1,6 +1,9 @@ defmodule Plausible.ClickhouseLocationData do @moduledoc """ - Schema for storing location id <-> translation mappins in Clickhouse + Schema for storing location id <-> translation mappings in ClickHouse + + Indirectly read via dictionary `location_data_dict` in ALIAS columns in + `events_v2`, `sessions_v2` and `imported_locations` tables. """ use Ecto.Schema diff --git a/lib/plausible/data_migration/locations_sync.ex b/lib/plausible/data_migration/locations_sync.ex index 806737ede142..2ca3971d2c54 100644 --- a/lib/plausible/data_migration/locations_sync.ex +++ b/lib/plausible/data_migration/locations_sync.ex @@ -1,8 +1,21 @@ defmodule Plausible.DataMigration.LocationsSync do @moduledoc """ - ClickHouse locations data migration for storing location names in clickhouse. + ClickHouse locations data migration for storing location names in ClickHouse. - Run regularly as plausible/locations data changes. + Only run when `Location.version()` changes. + + The migration: + 1. Truncates existing `location_data` table (if exists) + 2. Creates new table (if needed) + 3. Inserts new data from Location module + 4. (Re-)Creates dictionary to read location data from table + 5. Creates ALIAS columns in `events_v2`, `sessions_v2` and `imported_locations` table to make reading location names easy + 6. 
Updates table comment for `location_data` to indicate last version synced. + + Note that the dictionary is large enough to cache the whole dataset in memory, making lookups fast. + + This migration is intended to be idempotent and rerunnable - if run multiple times, it should always set things to the same + result as if run once. SQL files available at: priv/data_migrations/LocationsSync/sql """ From 7391de6a4173ff6c18d37dcd9d5aac24519a5842 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 10 Jul 2024 10:30:54 +0300 Subject: [PATCH 07/15] Explicit structs --- lib/plausible/data_migration/locations_sync.ex | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/plausible/data_migration/locations_sync.ex b/lib/plausible/data_migration/locations_sync.ex index 2ca3971d2c54..7867e6ef7202 100644 --- a/lib/plausible/data_migration/locations_sync.ex +++ b/lib/plausible/data_migration/locations_sync.ex @@ -95,17 +95,21 @@ defmodule Plausible.DataMigration.LocationsSync do countries = Location.Country.all() - |> Enum.map(fn country -> %{type: "country", id: country.alpha_2, name: country.name} end) + |> Enum.map(fn %Location.Country{alpha_2: alpha_2, name: name} -> + %{type: "country", id: alpha_2, name: name} + end) subdivisions = Location.Subdivision.all() - |> Enum.map(fn subdivision -> - %{type: "subdivision", id: subdivision.code, name: subdivision.name} + |> Enum.map(fn %Location.Subdivision{code: code, name: name} -> + %{type: "subdivision", id: code, name: name} end) cities = Location.City.all() - |> Enum.map(fn city -> %{type: "city", id: Integer.to_string(city.id), name: city.name} end) + |> Enum.map(fn %Location.City{id: id, name: name} -> + %{type: "city", id: Integer.to_string(id), name: name} + end) insert_data = Enum.concat([countries, subdivisions, cities]) @repo.insert_all(ClickhouseLocationData, insert_data) From ca2ab6de38fd35606cef67732e79fd3166e44bfe Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 10 
Jul 2024 10:31:31 +0300 Subject: [PATCH 08/15] Improve docs further --- lib/plausible/data_migration/locations_sync.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/plausible/data_migration/locations_sync.ex b/lib/plausible/data_migration/locations_sync.ex index 7867e6ef7202..9b81179c84ae 100644 --- a/lib/plausible/data_migration/locations_sync.ex +++ b/lib/plausible/data_migration/locations_sync.ex @@ -2,7 +2,7 @@ defmodule Plausible.DataMigration.LocationsSync do @moduledoc """ ClickHouse locations data migration for storing location names in ClickHouse. - Only run when `Location.version()` changes. + Only run when `Location.version()` changes: either as a migration or in cron. The migration: 1. Truncates existing `location_data` table (if exists) From 9f31843b405cfec4010e50579099f9b3baf59a5b Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 10 Jul 2024 10:34:08 +0300 Subject: [PATCH 09/15] Migration comment --- .../migrations/20240709181437_populate_location_data.exs | 1 + 1 file changed, 1 insertion(+) diff --git a/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs index edcc58f25e5f..a39275c6e708 100644 --- a/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs +++ b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs @@ -2,6 +2,7 @@ defmodule Plausible.IngestRepo.Migrations.PopulateLocationData do use Ecto.Migration def up do + # Location data may not be loaded, so _try_ to load it. Failure is OK - it means it's loaded. 
try do Location.load_all() rescue From 5179dae2907fa74d368eabf8adf25e7ee10852d1 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 10 Jul 2024 10:51:06 +0300 Subject: [PATCH 10/15] Add queues --- config/runtime.exs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config/runtime.exs b/config/runtime.exs index 17008f93a282..c5568bfcc5fb 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -566,7 +566,9 @@ base_queues = [ analytics_exports: 1, notify_exported_analytics: 1, domain_change_transition: 1, - check_accept_traffic_until: 1 + check_accept_traffic_until: 1, + clickhouse_clean_sites: 1, + locations_sync: 1 ] cloud_queues = [ From fe207269fb1c2f65bd50d32ae6e808a2ca910de3 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 10 Jul 2024 10:53:42 +0300 Subject: [PATCH 11/15] Add error when already loaded --- .../migrations/20240709181437_populate_location_data.exs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs index a39275c6e708..8bb0fb126a9c 100644 --- a/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs +++ b/priv/ingest_repo/migrations/20240709181437_populate_location_data.exs @@ -2,11 +2,11 @@ defmodule Plausible.IngestRepo.Migrations.PopulateLocationData do use Ecto.Migration def up do - # Location data may not be loaded, so _try_ to load it. Failure is OK - it means it's loaded. 
try do Location.load_all() rescue - _ -> nil + # Already loaded + ArgumentError -> nil end Plausible.DataMigration.LocationsSync.run() From f6a76cae58735d4ae68ebcfbf8868baecdd5baa9 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Wed, 10 Jul 2024 11:22:05 +0300 Subject: [PATCH 12/15] Test for filtering by new dimensions --- .../external_stats_controller/query_test.exs | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/test/plausible_web/controllers/api/external_stats_controller/query_test.exs b/test/plausible_web/controllers/api/external_stats_controller/query_test.exs index 5af0f704399a..e126eb9725c6 100644 --- a/test/plausible_web/controllers/api/external_stats_controller/query_test.exs +++ b/test/plausible_web/controllers/api/external_stats_controller/query_test.exs @@ -2813,4 +2813,60 @@ defmodule PlausibleWeb.Api.ExternalStatsController.QueryTest do %{"dimensions" => ["2021-01-03 00:00:00", "Twitter"], "metrics" => [1]} ] end + + test "filtering by visit:country_name, visit:region_name, visit:city_name", %{ + conn: conn, + site: site + } do + populate_stats(site, [ + # GB, London + build(:pageview, + country_code: "GB", + subdivision1_code: "GB-LND", + city_geoname_id: 2_643_743 + ), + # CA, London + build(:pageview, + country_code: "CA", + subdivision1_code: "CA-ON", + city_geoname_id: 6_058_560 + ), + # EE, Tallinn + build(:pageview, + country_code: "EE", + subdivision1_code: "EE-37", + city_geoname_id: 588_409 + ), + # EE, Tartu + build(:pageview, + country_code: "EE", + subdivision1_code: "EE-79", + city_geoname_id: 588_335 + ), + # EE, Jõgeva + build(:pageview, + country_code: "EE", + subdivision1_code: "EE-50", + city_geoname_id: 591_902 + ) + ]) + + conn = + post(conn, "/api/v2/query", %{ + "site_id" => site.domain, + "date_range" => "all", + "metrics" => ["pageviews"], + "filters" => [ + ["is", "visit:country_name", ["Estonia", "United Kingdom"]], + ["is_not", "visit:region_name", ["Tartumaa"]], + ["contains", 
"visit:city_name", ["n"]] + ], + "dimensions" => ["visit:country_name", "visit:region_name", "visit:city_name"] + }) + + assert json_response(conn, 200)["results"] == [ + %{"dimensions" => ["Estonia", "Harjumaa", "Tallinn"], "metrics" => [1]}, + %{"dimensions" => ["United Kingdom", "London, City of", "London"], "metrics" => [1]} + ] + end end From ac2c71d0eecd2b52961cbed2e797f27cd1e0a9d4 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Mon, 12 Aug 2024 10:27:00 +0300 Subject: [PATCH 13/15] Update deps --- mix.exs | 2 +- mix.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mix.exs b/mix.exs index ea953b31a9d1..19b023a42717 100644 --- a/mix.exs +++ b/mix.exs @@ -93,7 +93,7 @@ defmodule Plausible.MixProject do {:hackney, "~> 1.8"}, {:jason, "~> 1.3"}, {:kaffy, "~> 0.10.2", only: [:dev, :test, :staging, :prod]}, - {:location, git: "https://github.com/plausible/location.git", branch: "all-all"}, + {:location, git: "https://github.com/plausible/location.git"}, {:mox, "~> 1.0", only: [:test, :ce_test]}, {:nanoid, "~> 2.1.0"}, {:nimble_totp, "~> 1.0"}, diff --git a/mix.lock b/mix.lock index c5a48d88c5dd..ac3b3c89c691 100644 --- a/mix.lock +++ b/mix.lock @@ -73,7 +73,7 @@ "joken": {:hex, :joken, "2.6.0", "b9dd9b6d52e3e6fcb6c65e151ad38bf4bc286382b5b6f97079c47ade6b1bcc6a", [:mix], [{:jose, "~> 1.11.5", [hex: :jose, repo: "hexpm", optional: false]}], "hexpm", "5a95b05a71cd0b54abd35378aeb1d487a23a52c324fa7efdffc512b655b5aaa7"}, "jose": {:hex, :jose, "1.11.6", "613fda82552128aa6fb804682e3a616f4bc15565a048dabd05b1ebd5827ed965", [:mix, :rebar3], [], "hexpm", "6275cb75504f9c1e60eeacb771adfeee4905a9e182103aa59b53fed651ff9738"}, "kaffy": {:hex, :kaffy, "0.10.2", "72e807c525323bd0cbc3ac0c127b7bde61caffdc576fb6554964d3fe6a2a6100", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:phoenix, "~> 1.6", [hex: :phoenix, repo: "hexpm", optional: false]}, {:phoenix_html, "~> 3.0", [hex: :phoenix_html, repo: "hexpm", optional: false]}, 
{:phoenix_view, "~> 2.0.2", [hex: :phoenix_view, repo: "hexpm", optional: false]}], "hexpm", "651cad5f3bcc91510a671c13c7a273b8b8195fdf2d809208708baecbb77300bf"}, - "location": {:git, "https://github.com/plausible/location.git", "5eeaa752d8f68236ffe637446a8d7aabb05c900f", [branch: "all-all"]}, + "location": {:git, "https://github.com/plausible/location.git", "a89bf79985c3c3d0830477ae587001156a646ce8", []}, "locus": {:hex, :locus, "2.3.6", "c9f53fd5df872fca66a54dc0aa2f8b2d3640388e56a0c39a741be0df6d8854bf", [:rebar3], [{:tls_certificate_check, "~> 1.9", [hex: :tls_certificate_check, repo: "hexpm", optional: false]}], "hexpm", "6087aa9a69673e7011837fb4b3d7f756560adde76892c32f5f93904ee30064e2"}, "mail": {:hex, :mail, "0.3.1", "cb0a14e4ed8904e4e5a08214e686ccf6f9099346885db17d8c309381f865cc5c", [:mix], [], "hexpm", "1db701e89865c1d5fa296b2b57b1cd587587cca8d8a1a22892b35ef5a8e352a6"}, "makeup": {:hex, :makeup, "1.1.1", "fa0bc768698053b2b3869fa8a62616501ff9d11a562f3ce39580d60860c3a55e", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "5dc62fbdd0de44de194898b6710692490be74baa02d9d108bc29f007783b0b48"}, From a0ac1475d0191d0f8102b8499a1a40476343e370 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Mon, 12 Aug 2024 10:43:47 +0300 Subject: [PATCH 14/15] dimension -> select_dimension --- lib/plausible/stats/sql/expression.ex | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/plausible/stats/sql/expression.ex b/lib/plausible/stats/sql/expression.ex index 8639aa237b15..146eae8feed2 100644 --- a/lib/plausible/stats/sql/expression.ex +++ b/lib/plausible/stats/sql/expression.ex @@ -187,14 +187,14 @@ defmodule Plausible.Stats.SQL.Expression do def select_dimension(q, key, "visit:city", _table, _query), do: select_merge_as(q, [t], %{key => t.city}) - def dimension(key, "visit:country_name", _table, _query), - do: wrap_alias([t], %{key => t.country_name}) + def select_dimension(q, 
key, "visit:country_name", _table, _query), + do: select_merge_as(q, [t], %{key => t.country_name}) - def dimension(key, "visit:region_name", _table, _query), - do: wrap_alias([t], %{key => t.region_name}) + def select_dimension(q, key, "visit:region_name", _table, _query), + do: select_merge_as(q, [t], %{key => t.region_name}) - def dimension(key, "visit:city_name", _table, _query), - do: wrap_alias([t], %{key => t.city_name}) + def select_dimension(q, key, "visit:city_name", _table, _query), + do: select_merge_as(q, [t], %{key => t.city_name}) def event_metric(:pageviews) do wrap_alias([e], %{ From 632d62f25a548662e5c61f689f13096bd91a4f49 Mon Sep 17 00:00:00 2001 From: Karl-Aksel Puulmann Date: Mon, 12 Aug 2024 11:01:55 +0300 Subject: [PATCH 15/15] Update a test --- .../controllers/api/external_stats_controller/query_test.exs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/plausible_web/controllers/api/external_stats_controller/query_test.exs b/test/plausible_web/controllers/api/external_stats_controller/query_test.exs index 1acdf698e842..0c64082ac29e 100644 --- a/test/plausible_web/controllers/api/external_stats_controller/query_test.exs +++ b/test/plausible_web/controllers/api/external_stats_controller/query_test.exs @@ -2973,7 +2973,7 @@ defmodule PlausibleWeb.Api.ExternalStatsController.QueryTest do assert json_response(conn, 200)["results"] == [ %{"dimensions" => ["Estonia", "Harjumaa", "Tallinn"], "metrics" => [1]}, - %{"dimensions" => ["United Kingdom", "London, City of", "London"], "metrics" => [1]} + %{"dimensions" => ["United Kingdom", "London", "London"], "metrics" => [1]} ] end end