From 62d499e59b3a23c62e2d73a6d9160cbe211f3b29 Mon Sep 17 00:00:00 2001 From: mithereal Date: Sat, 13 Jan 2024 02:45:05 -0700 Subject: [PATCH 01/11] add postal code --- README.md | 5 +- config/runtime.exs | 1 - lib/location.ex | 3 + lib/location/postalcode.ex | 116 +++++++++++++++++++++++ lib/mix/tasks/update_geoname_data.ex | 32 ++++--- lib/mix/tasks/update_postal_code_data.ex | 33 +++++++ lib/scraper/scraper.ex | 64 ++++++++----- test/location_test.exs | 8 ++ 8 files changed, 221 insertions(+), 41 deletions(-) create mode 100644 lib/location/postalcode.ex create mode 100644 lib/mix/tasks/update_postal_code_data.ex diff --git a/README.md b/README.md index 469f117..3b0c85b 100644 --- a/README.md +++ b/README.md @@ -18,5 +18,8 @@ We also add some data manually that is missing from upstream. Overrides can be f ### Cities -The data for cities comes from the [geonames](http://www.geonames.org/) project. This project has scripts to downlaod the main `allCountries.txt` file. It is then processed to make it smaller +The data for cities comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download the main `allCountries.txt` file. It is then processed to make it smaller (from 1.3GB to about 130MB). Still, the resulting file is quite large so we also provide a city database based on the smaller `cities500.txt` file. + +### Postal Codes +The data for postal codes comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download the main `allCountries.txt` file. \ No newline at end of file diff --git a/config/runtime.exs b/config/runtime.exs index cd92a06..b8db304 100644 --- a/config/runtime.exs +++ b/config/runtime.exs @@ -3,4 +3,3 @@ import Config if config_env() != :prod do config :location, :lightweight, true end - diff --git a/lib/location.ex b/lib/location.ex index 70acebf..f1f8f78 100644 --- a/lib/location.ex +++ b/lib/location.ex @@ -8,6 +8,8 @@ defmodule Location do defdelegate search_subdivision(code), to: Location.Subdivision defdelegate get_city(code), to: Location.City defdelegate get_city(city_name, country_code), to: Location.City + defdelegate get_postal_code(code), to: Location.PostalCode + defdelegate get_postal_codes(country_code, state_code, city_name), to: Location.PostalCode def load_all() do Logger.debug("Loading location databases...") @@ -15,6 +17,7 @@ defmodule Location do :ok = load(Location.Country) :ok = load(Location.Subdivision) :ok = load(Location.City) + :ok = load(Location.PostalCode) end defp load(module) do diff --git a/lib/location/postalcode.ex b/lib/location/postalcode.ex new file mode 100644 index 0000000..ea8ff53 --- /dev/null +++ b/lib/location/postalcode.ex @@ -0,0 +1,116 @@ +defmodule Location.PostalCode do + @ets_table_by_id __MODULE__ + @ets_table_by_lookup Module.concat(__MODULE__, ByLookup) + + defstruct [ + :postal_code, + :country_code, + :state_code, + :city_name, + :latitude, + :longitude + ] + + def load() do + @ets_table_by_lookup = + :ets.new(@ets_table_by_lookup, [ + :set, + :named_table, + :public, + :compressed, + {:write_concurrency, true}, + {:read_concurrency, true}, + {:decentralized_counters, false} + ]) + + source_file() + |> File.stream!() + |> Stream.chunk_every(15_000) + |> Task.async_stream( + fn chunk -> + chunk + |> LocationCSV.parse_stream() + |> Stream.each(fn [ + country_code, + postal_code, + city_name, + _state_name, + state_code, + _municipality, + _municipality_code, + _admin_name3, + _admin_code3, + latitude, + longitude, + _accuracy + ] -> + country_code = String.trim(country_code) + + true = + :ets.insert( + @ets_table_by_lookup, + {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} + ) + + true = + :ets.insert( + @ets_table_by_id, + {postal_code, {country_code, state_code, city_name, latitude, longitude}} + ) + end) + |> Stream.run() + end, + timeout: :infinity + ) + |> Stream.run() + end + + @doc """ + Finds postal_code information by postal code. + """ + @spec get_postal_code(string()) :: %__MODULE__{} | nil + def get_postal_code(code) do + case :ets.lookup(@ets_table_by_id, code) do + [{postal_code, {country_code, state_code, city_name, latitude, longitude}}] -> + to_struct(postal_code, country_code, state_code, city_name, latitude, longitude) + + _ -> + nil + end + end + + @doc """ + Finds postal codes by city code, state code and country code. + + This function returns all postal code founds when the country has multiple + cities with the same name. + """ + @spec get_postal_codes(string(), string(), string()) :: %__MODULE__{} | nil + def get_postal_codes(country_code, state_code, city_name) do + case :ets.lookup(@ets_table_by_lookup, {country_code, state_code, city_name}) do + data when is_list(data) -> + Enum.map(data, fn x -> + {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} = x + to_struct(postal_code, country_code, state_code, city_name, latitude, longitude) + end) + + _ -> + nil + end + end + + defp source_file() do + Application.app_dir(:location, "priv/postal_codes.csv") + end + + defp to_struct(postal_code, country_code, state_code, city_name, latitude, longitude) do + %__MODULE__{ + postal_code: postal_code, + country_code: country_code, + state_code: state_code, + city_name: city_name, + latitude: latitude, + longitude: longitude + } + end +end diff --git a/lib/mix/tasks/update_geoname_data.ex b/lib/mix/tasks/update_geoname_data.ex index 9acb040..712c8e4 100644 --- a/lib/mix/tasks/update_geoname_data.ex +++ b/lib/mix/tasks/update_geoname_data.ex @@ -8,22 +8,24 @@ defmodule Mix.Tasks.UpdateGeonameData do The data source allCountries.txt clocks in at 1.5GB. Expect this to take a while. """ def run(_) do - #System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) - #System.cmd("unzip", ["/tmp/allCountries.zip", "-d", "/tmp"]) + # System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) + # System.cmd("unzip", ["/tmp/allCountries.zip", "-d", "/tmp"]) process_geonames_file("/tmp/allCountries.txt") end defp process_geonames_file(filename) do - tab = :binary.compile_pattern("\t") # BINARY - - result = filename - |> File.stream!(read_ahead: 100_000) - |> Flow.from_enumerable() - |> Flow.map(&String.split(&1, tab)) - |> Flow.partition() - |> Flow.reduce(fn -> [] end, &reduce_chunk/2) - |> Enum.into([]) + # BINARY + tab = :binary.compile_pattern("\t") + + result = + filename + |> File.stream!(read_ahead: 100_000) + |> Flow.from_enumerable() + |> Flow.map(&String.split(&1, tab)) + |> Flow.partition() + |> Flow.reduce(fn -> [] end, &reduce_chunk/2) + |> Enum.into([]) IO.puts("Writing result to #{@allcountries_dest}") @@ -32,10 +34,14 @@ defmodule Mix.Tasks.UpdateGeonameData do defp reduce_chunk(row, result) do case row do - [geoname_id, name, _, _, _, _, feature_class, _, country_code | _rest] when feature_class in ["P", "A"] -> # feature classes defined here: http://download.geonames.org/export/dump/ + # feature classes defined here: http://download.geonames.org/export/dump/ + [geoname_id, name, _, _, _, _, feature_class, _, country_code | _rest] + when feature_class in ["P", "A"] -> row = geoname_id <> "\t" <> name <> "\t" <> country_code [row | result] - _ -> result + + _ -> + result end end end diff --git a/lib/mix/tasks/update_postal_code_data.ex b/lib/mix/tasks/update_postal_code_data.ex new file mode 100644 index 0000000..ec34b02 --- /dev/null +++ b/lib/mix/tasks/update_postal_code_data.ex @@ -0,0 +1,33 @@ +defmodule Mix.Tasks.UpdatePostalCodeData do + use Mix.Task + + @allcountries_src "https://download.geonames.org/export/zip/allCountries.zip" + @allcountries_dest Application.app_dir(:location, "/priv/postal_codes.csv") + + @doc """ + The data source allCountries.zip clocks in at 16mb. Expect this to take a while. + """ + def run(_) do + System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) + System.cmd("unzip", ["/tmp/allCountries.zip", "-d", "/tmp"]) + + process_file("/tmp/allCountries.txt") + end + + defp process_file(filename) do + # BINARY + tab = :binary.compile_pattern("\t") + + result = + filename + |> File.stream!(read_ahead: 100_000) + |> Flow.from_enumerable() + |> Flow.map(&String.split(&1, tab)) + |> Flow.partition() + |> Enum.into([]) + + IO.puts("Writing result to #{@allcountries_dest}") + + File.write!(@allcountries_dest, Enum.join(result, "\n")) + end +end diff --git a/lib/scraper/scraper.ex b/lib/scraper/scraper.ex index d59db93..aab48eb 100644 --- a/lib/scraper/scraper.ex +++ b/lib/scraper/scraper.ex @@ -3,41 +3,50 @@ defmodule Location.Scraper do @subdivision_base_url @base_url <> "/wiki/ISO_3166-2:" @translations_dest Application.app_dir(:location, "/priv/iso_3166-2.en-translations.json") @countries_to_skip [ - "EE", # For Estonia the local names are better than English ones - "JP" # Source data from salsa-debian already has english translations where applicable + # For Estonia the local names are better than English ones + "EE", + # Source data from salsa-debian already has english translations where applicable + "JP" ] def scrape() do countries = Location.Country.all() - res = Enum.map(countries, &scrape_country/1) - |> Enum.filter(&(not is_nil(&1))) - |> List.flatten - |> Enum.into(%{}) - |> Jason.encode! + res = + Enum.map(countries, &scrape_country/1) + |> Enum.filter(&(not is_nil(&1))) + |> List.flatten() + |> Enum.into(%{}) + |> Jason.encode!() File.write!(@translations_dest, res) end defp scrape_country(%Location.Country{alpha_2: code}) when code in @countries_to_skip, do: nil + defp scrape_country(country) do url = @subdivision_base_url <> country.alpha_2 response = HTTPoison.get!(url) {:ok, document} = Floki.parse_document(response.body) - rows = Floki.find(document, "table.wikitable.sortable") - |> List.first - |> Floki.find("tbody tr") + rows = + Floki.find(document, "table.wikitable.sortable") + |> List.first() + |> Floki.find("tbody tr") - english_name_column = case List.first(rows) do - {"tr", _attrs, cells} -> - Enum.find_index(cells, fn cell -> - text = String.downcase(cell_text(cell)) - String.starts_with?(text, "subdivision name (en)") - || String.starts_with?(text, "subdivision name (sv)") - end) - _ -> nil - end + english_name_column = + case List.first(rows) do + {"tr", _attrs, cells} -> + Enum.find_index(cells, fn cell -> + text = String.downcase(cell_text(cell)) + + String.starts_with?(text, "subdivision name (en)") || + String.starts_with?(text, "subdivision name (sv)") + end) + + _ -> + nil + end if english_name_column do IO.puts("Scraping " <> country.name) @@ -51,13 +60,15 @@ defmodule Location.Scraper do end defp scrape_row({"tr", _attrs, children}, name_column_index) do - code = children - |> Enum.at(0) - |> cell_text + code = + children + |> Enum.at(0) + |> cell_text - name = children - |> Enum.at(name_column_index) - |> cell_text + name = + children + |> Enum.at(name_column_index) + |> cell_text {code, name} end @@ -66,6 +77,7 @@ defmodule Location.Scraper do Floki.text(text) |> String.trim() |> String.trim("[a]") - |> String.replace(~r/\[note \d\]$/, "") # Sometimes the entry contains something like "Region name[note 3]" + # Sometimes the entry contains something like "Region name[note 3]" + |> String.replace(~r/\[note \d\]$/, "") end end diff --git a/test/location_test.exs b/test/location_test.exs index 16f9df0..3d83762 100644 --- a/test/location_test.exs +++ b/test/location_test.exs @@ -69,4 +69,12 @@ defmodule LocationTest do assert city.name == "Springfield" end end + + describe "postal code" do + test "can look up postal codes for a city" do + codes = Location.get_postal_codes("US", "AZ", "Tucson") + + assert Enum.count(codes) > 0 + end + end end From 4109a6c9112d1e1bee6741e24d552578398b3db8 Mon Sep 17 00:00:00 2001 From: mithereal Date: Sat, 13 Jan 2024 06:03:30 -0700 Subject: [PATCH 02/11] add --source flag to choose other datasets --- lib/location/postalcode.ex | 3 ++- lib/mix/tasks/update_geoname_data.ex | 27 +++++++++++++------- lib/mix/tasks/update_postal_code_data.ex | 32 +++++++++++++++++------- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/lib/location/postalcode.ex b/lib/location/postalcode.ex index ea8ff53..8f79329 100644 --- a/lib/location/postalcode.ex +++ b/lib/location/postalcode.ex @@ -100,7 +100,8 @@ defmodule Location.PostalCode do end defp source_file() do - Application.app_dir(:location, "priv/postal_codes.csv") + default = Application.app_dir(:location, "/priv/postal_codes.csv") + Application.get_env(:location, :postal_codes_source_file, default) end defp to_struct(postal_code, country_code, state_code, city_name, latitude, longitude) do diff --git a/lib/mix/tasks/update_geoname_data.ex b/lib/mix/tasks/update_geoname_data.ex index 712c8e4..1a3e989 100644 --- a/lib/mix/tasks/update_geoname_data.ex +++ b/lib/mix/tasks/update_geoname_data.ex @@ -1,17 +1,26 @@ defmodule Mix.Tasks.UpdateGeonameData do use Mix.Task - @allcountries_src "https://download.geonames.org/export/dump/allCountries.zip" - @allcountries_dest Application.app_dir(:location, "/priv/geonames.csv") + @destination_filename Location.PostalCode.source_file() @doc """ - The data source allCountries.txt clocks in at 1.5GB. Expect this to take a while. + The data source clocks in at 1.5GB. Expect this to take a while. """ - def run(_) do - # System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) - # System.cmd("unzip", ["/tmp/allCountries.zip", "-d", "/tmp"]) - process_geonames_file("/tmp/allCountries.txt") + def run(args) do + {options, _, _} = + OptionParser.parse(["--source", "allCountries"], strict: [source: :string]) + + Keyword.get(options, :source) + |> main() + end + + def main(name) do + # src = "https://download.geonames.org/export/dump/#{name}.zip" + # System.cmd("wget", [src, "-O", "/tmp/#{name}.zip"]) + # System.cmd("unzip", ["/tmp/#{name}.zip", "-d", "/tmp"]) + + process_geonames_file("/tmp/#{name}.txt") end defp process_geonames_file(filename) do @@ -27,9 +36,9 @@ defmodule Mix.Tasks.UpdateGeonameData do |> Flow.reduce(fn -> [] end, &reduce_chunk/2) |> Enum.into([]) - IO.puts("Writing result to #{@allcountries_dest}") + IO.puts("Writing result to #{@destination_filename}") - File.write!(@allcountries_dest, Enum.join(result, "\n")) + File.write!(@destination_filename, Enum.join(result, "\n")) end defp reduce_chunk(row, result) do diff --git a/lib/mix/tasks/update_postal_code_data.ex b/lib/mix/tasks/update_postal_code_data.ex index ec34b02..40153b5 100644 --- a/lib/mix/tasks/update_postal_code_data.ex +++ b/lib/mix/tasks/update_postal_code_data.ex @@ -1,17 +1,31 @@ defmodule Mix.Tasks.UpdatePostalCodeData do use Mix.Task - @allcountries_src "https://download.geonames.org/export/zip/allCountries.zip" - @allcountries_dest Application.app_dir(:location, "/priv/postal_codes.csv") + @destination_filename Location.PostalCode.source_file() @doc """ - The data source allCountries.zip clocks in at 16mb. Expect this to take a while. + The data source clocks in at 16mb. Expect this to take a while. + The option --source will download and parse different datasets ie. AZ (https://download.geonames.org/export/zip/AZ.zip) in order to keep the set small """ - def run(_) do - System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) - System.cmd("unzip", ["/tmp/allCountries.zip", "-d", "/tmp"]) - process_file("/tmp/allCountries.txt") + def run(args) do + {options, _, _} = + OptionParser.parse(["--source", "allCountries"], strict: [source: :string]) + + Keyword.get(options, :source) + |> main() + end + + @doc """ + Fetch and Prepare a Postal Code Export + + """ + def main(name) do + src = "https://download.geonames.org/export/zip/#{name}.zip" + System.cmd("wget", [src, "-O", "/tmp/#{name}.zip"]) + System.cmd("unzip", ["/tmp/#{name}.zip", "-d", "/tmp"]) + + process_file("/tmp/#{name}.txt") end defp process_file(filename) do @@ -26,8 +40,8 @@ defmodule Mix.Tasks.UpdatePostalCodeData do |> Flow.partition() |> Enum.into([]) - IO.puts("Writing result to #{@allcountries_dest}") + IO.puts("Writing result to #{@destination_filename}") - File.write!(@allcountries_dest, Enum.join(result, "\n")) + File.write!(@destination_filename, Enum.join(result, "\n")) end end From f121355484c1609d83cc5431fafa18130e19cee7 Mon Sep 17 00:00:00 2001 From: mithereal Date: Sat, 13 Jan 2024 13:57:50 -0700 Subject: [PATCH 03/11] add --list --help and --append flags --- README.md | 8 +++--- lib/mix/tasks/update_geoname_data.ex | 34 +++++++++++++++++++----- lib/mix/tasks/update_postal_code_data.ex | 34 +++++++++++++++++++----- lib/scraper/scraper.ex | 15 +++++++++++ 4 files changed, 73 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 3b0c85b..7fb3e9c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Location -Elixir library for accessing ISO3166-1 (country) and ISO3166-2 (subdivision) data as well as geoname data for cities. Source data comes from the upstream [debian iso-codes](https://salsa.debian.org/iso-codes-team/iso-codes) package and the [Geonames](http://www.geonames.org/) project. +Elixir library for accessing ISO3166-1 (country) and ISO3166-2 (subdivision) data as well as geoname data for cities and postal code data. Source data comes from the upstream [debian iso-codes](https://salsa.debian.org/iso-codes-team/iso-codes) package and the [Geonames](http://www.geonames.org/) project. @@ -18,8 +18,8 @@ We also add some data manually that is missing from upstream. Overrides can be f ### Cities -The data for cities comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download the main `allCountries.txt` file. It is then processed to make it smaller -(from 1.3GB to about 130MB). Still, the resulting file is quite large so we also provide a city database based on the smaller `cities500.txt` file. +The data for cities comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download the main `allCountries.txt` file or individual country files. If allCountries is chosen is then processed to make it smaller +(from 1.3GB to about 130MB). Still, the resulting file is quite large so we also provide a city database based on the smaller `cities500.txt` file or one choose the --source option. ### Postal Codes -The data for postal codes comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download the main `allCountries.txt` file. \ No newline at end of file +The data for postal codes comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download all or individual postal code files via the --source option. diff --git a/lib/mix/tasks/update_geoname_data.ex b/lib/mix/tasks/update_geoname_data.ex index 1a3e989..0818eb6 100644 --- a/lib/mix/tasks/update_geoname_data.ex +++ b/lib/mix/tasks/update_geoname_data.ex @@ -9,21 +9,38 @@ defmodule Mix.Tasks.UpdateGeonameData do def run(args) do {options, _, _} = - OptionParser.parse(["--source", "allCountries"], strict: [source: :string]) + OptionParser.parse(["--source", "allCountries", "--list", "--append", "--help"], + strict: [source: :string, list: :boolean, append: :boolean, help: :boolean] + ) - Keyword.get(options, :source) - |> main() + case(Keyword.get(options, :help) || Keyword.get(options, :list)) do + false -> + Keyword.get(options, :source) + |> main(Keyword.get(options, :append)) + + true -> + if(Keyword.get(options, :help)) do + IO.puts( + "The following options are available, --source 'Choose an option from --list', --list 'List of available countries by code', --append 'Append to the downloaded file (if you want multiple countries but not all)'" + ) + end + + if(Keyword.get(options, :list)) do + sources = Location.Scraper.scrape_postal_files() + IO.puts("The following Postal Code Sources are Available #{sources}") + end + end end - def main(name) do + def main(name, append \\ false) do # src = "https://download.geonames.org/export/dump/#{name}.zip" # System.cmd("wget", [src, "-O", "/tmp/#{name}.zip"]) # System.cmd("unzip", ["/tmp/#{name}.zip", "-d", "/tmp"]) - process_geonames_file("/tmp/#{name}.txt") + process_geonames_file("/tmp/#{name}.txt", append) end - defp process_geonames_file(filename) do + defp process_geonames_file(filename, append \\ false) do # BINARY tab = :binary.compile_pattern("\t") @@ -38,7 +55,10 @@ defmodule Mix.Tasks.UpdateGeonameData do IO.puts("Writing result to #{@destination_filename}") - File.write!(@destination_filename, Enum.join(result, "\n")) + case append do + false -> File.write!(@destination_filename, Enum.join(result, "\n")) + true -> File.write!(@destination_filename, Enum.join(result, "\n"), :append) + end end defp reduce_chunk(row, result) do diff --git a/lib/mix/tasks/update_postal_code_data.ex b/lib/mix/tasks/update_postal_code_data.ex index 40153b5..7e0b8d0 100644 --- a/lib/mix/tasks/update_postal_code_data.ex +++ b/lib/mix/tasks/update_postal_code_data.ex @@ -10,25 +10,42 @@ defmodule Mix.Tasks.UpdatePostalCodeData do def run(args) do {options, _, _} = - OptionParser.parse(["--source", "allCountries"], strict: [source: :string]) + OptionParser.parse(["--source", "allCountries", "--list", "--append", "--help"], + strict: [source: :string, list: :boolean, append: :boolean, help: :boolean] + ) - Keyword.get(options, :source) - |> main() + case(Keyword.get(options, :help) || Keyword.get(options, :list)) do + false -> + Keyword.get(options, :source) + |> main(Keyword.get(options, :append)) + + true -> + if(Keyword.get(options, :help)) do + IO.puts( + "The following options are available, --source 'Choose an option from --list', --list 'List of available countries by code', --append 'Append to the downloaded file (if you want multiple countries but not all)'" + ) + end + + if(Keyword.get(options, :list)) do + sources = Location.Scraper.scrape_postal_files() + IO.puts("The following Postal Code Sources are Available #{sources}") + end + end end @doc """ Fetch and Prepare a Postal Code Export """ - def main(name) do + def main(name, append \\ false) do src = "https://download.geonames.org/export/zip/#{name}.zip" System.cmd("wget", [src, "-O", "/tmp/#{name}.zip"]) System.cmd("unzip", ["/tmp/#{name}.zip", "-d", "/tmp"]) - process_file("/tmp/#{name}.txt") + process_file("/tmp/#{name}.txt", append) end - defp process_file(filename) do + defp process_file(filename, append) do # BINARY tab = :binary.compile_pattern("\t") @@ -42,6 +59,9 @@ defmodule Mix.Tasks.UpdatePostalCodeData do IO.puts("Writing result to #{@destination_filename}") - File.write!(@destination_filename, Enum.join(result, "\n")) + case append do + false -> File.write!(@destination_filename, Enum.join(result, "\n")) + true -> File.write!(@destination_filename, Enum.join(result, "\n"), :append) + end end end diff --git a/lib/scraper/scraper.ex b/lib/scraper/scraper.ex index aab48eb..ed93d53 100644 --- a/lib/scraper/scraper.ex +++ b/lib/scraper/scraper.ex @@ -1,5 +1,6 @@ defmodule Location.Scraper do @base_url "https://en.wikipedia.org" + @postal_code_url "https://download.geonames.org/export/zip/" @subdivision_base_url @base_url <> "/wiki/ISO_3166-2:" @translations_dest Application.app_dir(:location, "/priv/iso_3166-2.en-translations.json") @countries_to_skip [ @@ -80,4 +81,18 @@ defmodule Location.Scraper do # Sometimes the entry contains something like "Region name[note 3]" |> String.replace(~r/\[note \d\]$/, "") end + + def scrape_postal_files() do + response = HTTPoison.get!(@postal_code_url) + {:ok, document} = Floki.parse_document(response.body) + + result = + Floki.find(document, "pre") + + Enum.map(result, fn x -> + [{_, [{_, href}], [name]}] = Floki.find(x, "a") + String.replace(name, ".zip", "") + end) + |> Enum.join(", ") + end end From fdaf83ce29d6d9ebc931007707d4b8fced4699d4 Mon Sep 17 00:00:00 2001 From: mithereal Date: Tue, 23 Jan 2024 23:44:33 -0700 Subject: [PATCH 04/11] fix missing function --- lib/location/postalcode.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/location/postalcode.ex b/lib/location/postalcode.ex index 8f79329..2f90cc0 100644 --- a/lib/location/postalcode.ex +++ b/lib/location/postalcode.ex @@ -99,7 +99,7 @@ defmodule Location.PostalCode do end end - defp source_file() do + def source_file() do default = Application.app_dir(:location, "/priv/postal_codes.csv") Application.get_env(:location, :postal_codes_source_file, default) end From 2389a1b8dbc0cfd9f572bdc4abfa583021269823 Mon Sep 17 00:00:00 2001 From: mithereal Date: Tue, 23 Jan 2024 23:50:50 -0700 Subject: [PATCH 05/11] fix default filename --- lib/location/postalcode.ex | 2 +- lib/mix/tasks/update_geoname_data.ex | 3 ++- lib/mix/tasks/update_postal_code_data.ex | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/location/postalcode.ex b/lib/location/postalcode.ex index 2f90cc0..8f79329 100644 --- a/lib/location/postalcode.ex +++ b/lib/location/postalcode.ex @@ -99,7 +99,7 @@ defmodule Location.PostalCode do end end - def source_file() do + defp source_file() do default = Application.app_dir(:location, "/priv/postal_codes.csv") Application.get_env(:location, :postal_codes_source_file, default) end diff --git a/lib/mix/tasks/update_geoname_data.ex b/lib/mix/tasks/update_geoname_data.ex index 0818eb6..e251844 100644 --- a/lib/mix/tasks/update_geoname_data.ex +++ b/lib/mix/tasks/update_geoname_data.ex @@ -1,7 +1,8 @@ defmodule Mix.Tasks.UpdateGeonameData do use Mix.Task - @destination_filename Location.PostalCode.source_file() + default = Application.app_dir(:location, "/priv/postal_codes.csv") + @destination_filename Application.get_env(:location, :postal_codes_source_file, default) @doc """ The data source clocks in at 1.5GB. Expect this to take a while. diff --git a/lib/mix/tasks/update_postal_code_data.ex b/lib/mix/tasks/update_postal_code_data.ex index 7e0b8d0..ec3e5e2 100644 --- a/lib/mix/tasks/update_postal_code_data.ex +++ b/lib/mix/tasks/update_postal_code_data.ex @@ -1,7 +1,8 @@ defmodule Mix.Tasks.UpdatePostalCodeData do use Mix.Task - @destination_filename Location.PostalCode.source_file() + default = Application.app_dir(:location, "/priv/postal_codes.csv") + @destination_filename Application.get_env(:location, :postal_codes_source_file, default) @doc """ The data source clocks in at 16mb. Expect this to take a while. From 12bc24fd2128e85165fa6e21d845f97d94862e3f Mon Sep 17 00:00:00 2001 From: mithereal Date: Thu, 25 Jan 2024 06:42:40 -0700 Subject: [PATCH 06/11] fix build errors with certs --- lib/scraper/scraper.ex | 6 ++++-- mix.exs | 9 +++++---- mix.lock | 16 +++++++++------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/lib/scraper/scraper.ex b/lib/scraper/scraper.ex index ed93d53..4c591c8 100644 --- a/lib/scraper/scraper.ex +++ b/lib/scraper/scraper.ex @@ -1,4 +1,6 @@ defmodule Location.Scraper do + use Tesla + @base_url "https://en.wikipedia.org" @postal_code_url "https://download.geonames.org/export/zip/" @subdivision_base_url @base_url <> "/wiki/ISO_3166-2:" @@ -27,7 +29,7 @@ defmodule Location.Scraper do defp scrape_country(country) do url = @subdivision_base_url <> country.alpha_2 - response = HTTPoison.get!(url) + response = get!(url) {:ok, document} = Floki.parse_document(response.body) rows = @@ -83,7 +85,7 @@ defmodule Location.Scraper do end def scrape_postal_files() do - response = HTTPoison.get!(@postal_code_url) + response = get!(@postal_code_url) {:ok, document} = Floki.parse_document(response.body) result = diff --git a/mix.exs b/mix.exs index 992bc22..a7ed139 100644 --- a/mix.exs +++ b/mix.exs @@ -39,10 +39,11 @@ defmodule Location.MixProject do defp deps do [ {:jason, "~> 1.3"}, - {:nimble_csv, "~> 1.1"}, - {:floki, "~> 0.31.0", only: [:dev, :test]}, - {:httpoison, "~> 1.8", only: [:dev, :test]}, - {:flow, "~> 1.0", only: [:dev, :test]} + {:nimble_csv, "~> 1.2"}, + {:floki, "~> 0.35.2", only: [:dev, :test]}, + {:tesla, "~> 1.8"}, + {:hackney, "~> 1.20"}, + {:flow, "~> 1.2", only: [:dev, :test]} ] end end diff --git a/mix.lock b/mix.lock index fff16eb..a2cfe41 100644 --- a/mix.lock +++ b/mix.lock @@ -1,17 +1,19 @@ %{ - "certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"}, - "floki": {:hex, :floki, "0.31.0", "f05ee8a8e6a3ced4e62beeb2c79a63bc8e12ab98fbaaf6e6a3d9b76b1278e23f", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm", "b05afa372f5c345a5bf240ac25ea1f0f3d5fcfd7490ac0beeb4a203f9444891e"}, - "flow": {:hex, :flow, "1.1.0", "b569c1042cb2da97103f6d70a0267a5657dce1402f41b4020bef98bbef9c7c1e", [:mix], [{:gen_stage, "~> 1.0", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "066f42f7a1ea6a86cb4ef763310338981a5cfb93bcebce10863a23a4859fd785"}, - "gen_stage": {:hex, :gen_stage, "1.1.2", "b1656cd4ba431ed02c5656fe10cb5423820847113a07218da68eae5d6a260c23", [:mix], [], "hexpm", "9e39af23140f704e2b07a3e29d8f05fd21c2aaf4088ff43cb82be4b9e3148d02"}, - "hackney": {:hex, :hackney, "1.18.0", "c4443d960bb9fba6d01161d01cd81173089686717d9490e5d3606644c48d121f", [:rebar3], [{:certifi, "~>2.8.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~>6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~>1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.3.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~>1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "9afcda620704d720db8c6a3123e9848d09c87586dc1c10479c42627b905b5c5e"}, + "certifi": {:hex, :certifi, "2.12.0", "2d1cca2ec95f59643862af91f001478c9863c2ac9cb6e2f89780bfd8de987329", [:rebar3], [], "hexpm", "ee68d85df22e554040cdb4be100f33873ac6051387baf6a8f6ce82272340ff1c"}, + "floki": {:hex, :floki, "0.35.2", "87f8c75ed8654b9635b311774308b2760b47e9a579dabf2e4d5f1e1d42c39e0b", [:mix], [], "hexpm", "6b05289a8e9eac475f644f09c2e4ba7e19201fd002b89c28c1293e7bd16773d9"}, + "flow": {:hex, :flow, "1.2.4", "1dd58918287eb286656008777cb32714b5123d3855956f29aa141ebae456922d", [:mix], [{:gen_stage, "~> 1.0", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "874adde96368e71870f3510b91e35bc31652291858c86c0e75359cbdd35eb211"}, + "gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"}, + "hackney": {:hex, :hackney, "1.20.1", "8d97aec62ddddd757d128bfd1df6c5861093419f8f7a4223823537bad5d064e2", [:rebar3], [{:certifi, "~> 2.12.0", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "~> 6.1.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "~> 1.0.0", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~> 1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:parse_trans, "3.4.1", [hex: :parse_trans, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "~> 1.1.0", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}, {:unicode_util_compat, "~> 0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "fe9094e5f1a2a2c0a7d10918fee36bfec0ec2a979994cff8cfe8058cd9af38e3"}, "html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"}, "httpoison": {:hex, :httpoison, "1.8.0", "6b85dea15820b7804ef607ff78406ab449dd78bed923a49c7160e1886e987a3d", [:mix], [{:hackney, "~> 1.17", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm", "28089eaa98cf90c66265b6b5ad87c59a3729bea2e74e9d08f9b51eb9729b3c3a"}, "idna": {:hex, :idna, "6.1.1", "8a63070e9f7d0c62eb9d9fcb360a7de382448200fbbd1b106cc96d3d8099df8d", [:rebar3], [{:unicode_util_compat, "~>0.7.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "92376eb7894412ed19ac475e4a86f7b413c1b9fbb5bd16dccd57934157944cea"}, "jason": {:hex, :jason, "1.3.0", "fa6b82a934feb176263ad2df0dbd91bf633d4a46ebfdffea0c8ae82953714946", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "53fc1f51255390e0ec7e50f9cb41e751c260d065dcba2bf0d08dc51a4002c2ac"}, "metrics": {:hex, :metrics, "1.0.1", "25f094dea2cda98213cecc3aeff09e940299d950904393b2a29d191c346a8486", [:rebar3], [], "hexpm", "69b09adddc4f74a40716ae54d140f93beb0fb8978d8636eaded0c31b6f099f16"}, + "mime": {:hex, :mime, "2.0.5", "dc34c8efd439abe6ae0343edbb8556f4d63f178594894720607772a041b04b02", [:mix], [], "hexpm", "da0d64a365c45bc9935cc5c8a7fc5e49a0e0f9932a761c55d6c52b142780a05c"}, "mimerl": {:hex, :mimerl, "1.2.0", "67e2d3f571088d5cfd3e550c383094b47159f3eee8ffa08e64106cdf5e981be3", [:rebar3], [], "hexpm", "f278585650aa581986264638ebf698f8bb19df297f66ad91b18910dfc6e19323"}, "nimble_csv": {:hex, :nimble_csv, "1.2.0", "4e26385d260c61eba9d4412c71cea34421f296d5353f914afe3f2e71cce97722", [:mix], [], "hexpm", "d0628117fcc2148178b034044c55359b26966c6eaa8e2ce15777be3bbc91b12a"}, - "parse_trans": {:hex, :parse_trans, "3.3.1", "16328ab840cc09919bd10dab29e431da3af9e9e7e7e6f0089dd5a2d2820011d8", [:rebar3], [], "hexpm", "07cd9577885f56362d414e8c4c4e6bdf10d43a8767abb92d24cbe8b24c54888b"}, - "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.6", "cf344f5692c82d2cd7554f5ec8fd961548d4fd09e7d22f5b62482e5aeaebd4b0", [:make, :mix, :rebar3], [], "hexpm", "bdb0d2471f453c88ff3908e7686f86f9be327d065cc1ec16fa4540197ea04680"}, + "parse_trans": {:hex, :parse_trans, "3.4.1", "6e6aa8167cb44cc8f39441d05193be6e6f4e7c2946cb2759f015f8c56b76e5ff", [:rebar3], [], "hexpm", "620a406ce75dada827b82e453c19cf06776be266f5a67cff34e1ef2cbb60e49a"}, + "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, + "tesla": {:hex, :tesla, "1.8.0", "d511a4f5c5e42538d97eef7c40ec4f3e44effdc5068206f42ed859e09e51d1fd", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: true]}, {:finch, "~> 0.13", [hex: :finch, repo: "hexpm", optional: true]}, {:fuse, "~> 2.4", [hex: :fuse, repo: "hexpm", optional: true]}, {:gun, ">= 1.0.0", [hex: :gun, repo: "hexpm", optional: true]}, {:hackney, "~> 1.6", [hex: :hackney, repo: "hexpm", optional: true]}, {:ibrowse, "4.4.2", [hex: :ibrowse, repo: "hexpm", optional: true]}, {:jason, ">= 1.0.0", [hex: :jason, repo: "hexpm", optional: true]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: true]}, {:msgpax, "~> 2.3", [hex: :msgpax, repo: "hexpm", optional: true]}, {:poison, ">= 1.0.0", [hex: :poison, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: true]}], "hexpm", "10501f360cd926a309501287470372af1a6e1cbed0f43949203a4c13300bc79f"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, } From 9fc3523448b9f99d08c392ac3847dcefa411e87c Mon Sep 17 00:00:00 2001 From: mithereal Date: Thu, 21 Mar 2024 18:36:21 -0700 Subject: [PATCH 07/11] fix make load publoc --- lib/location.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/location.ex b/lib/location.ex index f1f8f78..6d271e1 100644 --- a/lib/location.ex +++ b/lib/location.ex @@ -20,7 +20,7 @@ defmodule Location do :ok = load(Location.PostalCode) end - defp load(module) do + def load(module) do {t, _result} = :timer.tc(fn -> module.load() From 6576d4ffde46d4cda0909951f33e6f9cddb16f52 Mon Sep 17 00:00:00 2001 From: mithereal Date: Thu, 21 Mar 2024 20:02:43 -0700 Subject: [PATCH 08/11] add zip code fetch and extract --- README.md | 3 +++ lib/scraper/scraper.ex | 26 ++++++++++++++++++++++++++ mix.exs | 3 ++- mix.lock | 1 + 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7fb3e9c..627b1be 100644 --- a/README.md +++ b/README.md @@ -23,3 +23,6 @@ The data for cities comes from the [geonames](http://www.geonames.org/) project. ### Postal Codes The data for postal codes comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download all or individual postal code files via the --source option. + +#### Postal Code Helpers +Postal codes can be downloaded \ No newline at end of file diff --git a/lib/scraper/scraper.ex b/lib/scraper/scraper.ex index 4c591c8..2e791d3 100644 --- a/lib/scraper/scraper.ex +++ b/lib/scraper/scraper.ex @@ -3,6 +3,7 @@ defmodule Location.Scraper do @base_url "https://en.wikipedia.org" @postal_code_url "https://download.geonames.org/export/zip/" + @postal_code_dest Application.app_dir(:location, "/priv/") @subdivision_base_url @base_url <> "/wiki/ISO_3166-2:" @translations_dest Application.app_dir(:location, "/priv/iso_3166-2.en-translations.json") @countries_to_skip [ @@ -91,10 +92,35 @@ defmodule Location.Scraper do result = Floki.find(document, "pre") + result = Floki.find(result, "a") |> Enum.drop(5) + Enum.map(result, fn x -> [{_, [{_, href}], [name]}] = Floki.find(x, "a") String.replace(name, ".zip", "") end) |> Enum.join(", ") end + + def fetch_postal_file(file) do + response = get!(@postal_code_url <> "#{file}.zip") + File.write!(@postal_code_dest <> "/#{file}.zip", response.body) + end + + def extract_postal_file(file) do + zip_file = Unzip.LocalFile.open("priv/#{file}.zip") + + try do + {:ok, unzip} = Unzip.new(zip_file) + + Unzip.file_stream!(unzip, "#{file}.txt") + |> Stream.into(File.stream!("priv/#{file}.csv")) + |> Stream.run() + after + Unzip.LocalFile.close(zip_file) + end + end + + def fetch_postal_files(files) do + Enum.each(files, fn file -> fetch_postal_file(file) end) + end end diff --git a/mix.exs b/mix.exs index a7ed139..5689c1a 100644 --- a/mix.exs +++ b/mix.exs @@ -43,7 +43,8 @@ defmodule Location.MixProject do {:floki, "~> 0.35.2", only: [:dev, :test]}, {:tesla, "~> 1.8"}, {:hackney, "~> 1.20"}, - {:flow, "~> 1.2", only: [:dev, :test]} + {:flow, "~> 1.2", only: [:dev, :test]}, + {:unzip, "0.11.0"} ] end end diff --git a/mix.lock b/mix.lock index a2cfe41..b0631e6 100644 --- a/mix.lock +++ b/mix.lock @@ -16,4 +16,5 @@ "ssl_verify_fun": {:hex, :ssl_verify_fun, "1.1.7", "354c321cf377240c7b8716899e182ce4890c5938111a1296add3ec74cf1715df", [:make, :mix, :rebar3], [], "hexpm", "fe4c190e8f37401d30167c8c405eda19469f34577987c76dde613e838bbc67f8"}, "tesla": {:hex, :tesla, "1.8.0", "d511a4f5c5e42538d97eef7c40ec4f3e44effdc5068206f42ed859e09e51d1fd", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:exjsx, ">= 3.0.0", [hex: :exjsx, repo: "hexpm", optional: true]}, {:finch, "~> 0.13", [hex: :finch, repo: "hexpm", optional: true]}, {:fuse, "~> 2.4", [hex: :fuse, repo: "hexpm", optional: true]}, {:gun, ">= 1.0.0", [hex: :gun, repo: "hexpm", optional: true]}, {:hackney, "~> 1.6", [hex: :hackney, repo: "hexpm", optional: true]}, {:ibrowse, "4.4.2", [hex: :ibrowse, repo: "hexpm", optional: true]}, {:jason, ">= 1.0.0", [hex: :jason, repo: "hexpm", optional: true]}, {:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:mint, "~> 1.0", [hex: :mint, repo: "hexpm", optional: true]}, {:msgpax, "~> 2.3", [hex: :msgpax, repo: "hexpm", optional: true]}, {:poison, ">= 1.0.0", [hex: :poison, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: true]}], "hexpm", "10501f360cd926a309501287470372af1a6e1cbed0f43949203a4c13300bc79f"}, "unicode_util_compat": {:hex, :unicode_util_compat, "0.7.0", "bc84380c9ab48177092f43ac89e4dfa2c6d62b40b8bd132b1059ecc7232f9a78", [:rebar3], [], "hexpm", "25eee6d67df61960cf6a794239566599b09e17e668d3700247bc498638152521"}, + "unzip": {:hex, :unzip, "0.11.0", "ffa85fede998a84c7d7eb026f99d2eb13a7806a121e9239503f310062185cdce", [:mix], [], "hexpm", "f536dc40011d4a0d6d3ddb0919daea01684912fa62b6b9495ded879a1fd8b265"}, } From 6d0a92814ed5251262a80b32439fe5baa2e8fa5d Mon Sep 17 00:00:00 2001 From: mithereal Date: Sun, 23 Feb 2025 20:48:27 -0700 Subject: [PATCH 09/11] refactor tasks so they show up in mix help --- README.md | 3 + {mix_tasks => lib/location}/http.ex | 0 lib/location/postalcode.ex | 4 +- {mix_tasks => lib/location}/scraper.ex | 2 +- .../mix/tasks}/update_english_translations.ex | 3 +- .../mix/tasks}/update_geoname_data.ex | 14 ++-- .../mix/tasks}/update_iso_data.ex | 3 +- .../mix/tasks}/update_postal_code_data.ex | 20 +++-- location.iml | 81 +++++++++++++++++++ mix.exs | 6 +- priv/version | 1 - 11 files changed, 117 insertions(+), 20 deletions(-) rename {mix_tasks => lib/location}/http.ex (100%) rename {mix_tasks => lib/location}/scraper.ex (95%) rename {mix_tasks => lib/mix/tasks}/update_english_translations.ex (98%) rename {mix_tasks => lib/mix/tasks}/update_geoname_data.ex (70%) rename {mix_tasks => lib/mix/tasks}/update_iso_data.ex (95%) rename {mix_tasks => lib/mix/tasks}/update_postal_code_data.ex (78%) create mode 100644 location.iml delete mode 100644 priv/version diff --git a/README.md b/README.md index 1c15484..c3bcb74 100644 --- a/README.md +++ b/README.md @@ -19,3 +19,6 @@ The data for cities comes from the [geonames](http://www.geonames.org/) project. ### Postal Codes The data for postal codes comes from the [geonames](http://www.geonames.org/) project. This project has scripts to download all or individual postal code files via the --source option. + +### Mix Tasks +one must first refresh the applications task lists by running `mix compile` tasks can then be viewed via `mix help` \ No newline at end of file diff --git a/mix_tasks/http.ex b/lib/location/http.ex similarity index 100% rename from mix_tasks/http.ex rename to lib/location/http.ex diff --git a/lib/location/postalcode.ex b/lib/location/postalcode.ex index 8f79329..b65cc4e 100644 --- a/lib/location/postalcode.ex +++ b/lib/location/postalcode.ex @@ -68,7 +68,7 @@ defmodule Location.PostalCode do @doc """ Finds postal_code information by postal code. """ - @spec get_postal_code(string()) :: %__MODULE__{} | nil + @spec get_postal_code(String.t()) :: %__MODULE__{} | nil def get_postal_code(code) do case :ets.lookup(@ets_table_by_id, code) do [{postal_code, {country_code, state_code, city_name, latitude, longitude}}] -> @@ -85,7 +85,7 @@ defmodule Location.PostalCode do This function returns all postal code founds when the country has multiple cities with the same name. """ - @spec get_postal_codes(string(), string(), string()) :: %__MODULE__{} | nil + @spec get_postal_codes(String.t(), String.t(), String.t()) :: %__MODULE__{} | nil def get_postal_codes(country_code, state_code, city_name) do case :ets.lookup(@ets_table_by_lookup, {country_code, state_code, city_name}) do data when is_list(data) -> diff --git a/mix_tasks/scraper.ex b/lib/location/scraper.ex similarity index 95% rename from mix_tasks/scraper.ex rename to lib/location/scraper.ex index 2e75f6b..a5db935 100644 --- a/mix_tasks/scraper.ex +++ b/lib/location/scraper.ex @@ -19,7 +19,7 @@ defmodule Location.Scraper do result = Floki.find(result, "a") |> Enum.drop(5) Enum.map(result, fn x -> - [{_, [{_, href}], [name]}] = Floki.find(x, "a") + [{_, [{_, _href}], [name]}] = Floki.find(x, "a") String.replace(name, ".zip", "") end) |> Enum.join(", ") diff --git a/mix_tasks/update_english_translations.ex b/lib/mix/tasks/update_english_translations.ex similarity index 98% rename from mix_tasks/update_english_translations.ex rename to lib/mix/tasks/update_english_translations.ex index 5f717b5..03cb873 100644 --- a/mix_tasks/update_english_translations.ex +++ b/lib/mix/tasks/update_english_translations.ex @@ -1,5 +1,6 @@ -defmodule Mix.Tasks.UpdateEnglishTranslations do +defmodule Mix.Tasks.Location.UpdateEnglishTranslations do use Mix.Task + @shortdoc "Updates english translations for locations" @cldr_url "https://raw.githubusercontent.com/unicode-org/cldr/main/common/subdivisions/en.xml" @translations_dest Application.app_dir(:location, "/priv/iso_3166-2.en-translations.json") diff --git a/mix_tasks/update_geoname_data.ex b/lib/mix/tasks/update_geoname_data.ex similarity index 70% rename from mix_tasks/update_geoname_data.ex rename to lib/mix/tasks/update_geoname_data.ex index 51a4159..9b574d1 100644 --- a/mix_tasks/update_geoname_data.ex +++ b/lib/mix/tasks/update_geoname_data.ex @@ -1,15 +1,19 @@ -defmodule Mix.Tasks.UpdateGeonameData do +defmodule Mix.Tasks.Location.UpdateGeonameData do use Mix.Task - - # @allcountries_src "https://download.geonames.org/export/dump/allCountries.zip" + @shortdoc "Updates the geonamedata for locations" + @allcountries_src "https://download.geonames.org/export/dump/allCountries.zip" @allcountries_dest Application.app_dir(:location, "/priv/geonames.csv") @doc """ The data source allCountries.txt clocks in at 1.5GB. Expect this to take a while. """ def run(_) do - # System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) - # System.cmd("unzip", ["/tmp/allCountries.zip", "-d", "/tmp"]) + System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) + zip_file = Unzip.LocalFile.open("/tmp/allCountries.zip") + {:ok, unzip} = Unzip.new(zip_file) + Unzip.file_stream!(unzip, "allCountries.txt") + |> Stream.into(File.stream!("/tmp/allCountries.txt")) + |> Stream.run() process_geonames_file("/tmp/allCountries.txt") end diff --git a/mix_tasks/update_iso_data.ex b/lib/mix/tasks/update_iso_data.ex similarity index 95% rename from mix_tasks/update_iso_data.ex rename to lib/mix/tasks/update_iso_data.ex index a09a710..4ae23da 100644 --- a/mix_tasks/update_iso_data.ex +++ b/lib/mix/tasks/update_iso_data.ex @@ -1,5 +1,6 @@ -defmodule Mix.Tasks.UpdateIsoData do +defmodule Mix.Tasks.Location.UpdateIsoData do use Mix.Task + @shortdoc "Updates the isocodes for locations" @countries_src "https://salsa.debian.org/iso-codes-team/iso-codes/-/raw/main/data/iso_3166-1.json" @subdivisions_src "https://salsa.debian.org/iso-codes-team/iso-codes/-/raw/main/data/iso_3166-2.json" diff --git a/mix_tasks/update_postal_code_data.ex b/lib/mix/tasks/update_postal_code_data.ex similarity index 78% rename from mix_tasks/update_postal_code_data.ex rename to lib/mix/tasks/update_postal_code_data.ex index 2486afb..eba6005 100644 --- a/mix_tasks/update_postal_code_data.ex +++ b/lib/mix/tasks/update_postal_code_data.ex @@ -1,8 +1,8 @@ -defmodule Mix.Tasks.UpdatePostalCodeData do +defmodule Mix.Tasks.Location.UpdatePostalCodeData do use Mix.Task + @shortdoc "Updates the postal code data from source" - default = Application.app_dir(:location, "/priv/postal_codes.csv") - @destination_filename Application.get_env(:location, :postal_codes_source_file, default) + @destination_filename Application.compile_env(:location, :postal_codes_source_file, "priv/postal_codes.csv") @doc """ The data source clocks in at 16mb. Expect this to take a while. @@ -11,10 +11,12 @@ defmodule Mix.Tasks.UpdatePostalCodeData do def run(args) do {options, _, _} = - OptionParser.parse(["--source", "allCountries", "--list", "--append", "--help"], + OptionParser.parse(args, strict: [source: :string, list: :boolean, append: :boolean, help: :boolean] ) + options = Keyword.merge([help: false, list: false, append: false], options) + case(Keyword.get(options, :help) || Keyword.get(options, :list)) do false -> Keyword.get(options, :source) @@ -41,7 +43,12 @@ defmodule Mix.Tasks.UpdatePostalCodeData do def main(name, append \\ false) do src = "https://download.geonames.org/export/zip/#{name}.zip" System.cmd("wget", [src, "-O", "/tmp/#{name}.zip"]) - System.cmd("unzip", ["/tmp/#{name}.zip", "-d", "/tmp"]) + + zip_file = Unzip.LocalFile.open("/tmp/#{name}.zip") + {:ok, unzip} = Unzip.new(zip_file) + Unzip.file_stream!(unzip, "#{name}.txt") + |> Stream.into(File.stream!("/tmp/#{name}.txt")) + |> Stream.run() process_file("/tmp/#{name}.txt", append) end @@ -60,11 +67,12 @@ defmodule Mix.Tasks.UpdatePostalCodeData do IO.puts("Writing result to #{@destination_filename}") + Location.Scraper.write_date_to_version() + case append do false -> File.write!(@destination_filename, Enum.join(result, "\n")) true -> File.write!(@destination_filename, Enum.join(result, "\n"), :append) end - Location.Scraper.write_date_to_version() end end diff --git a/location.iml b/location.iml new file mode 100644 index 0000000..1c30b2f --- /dev/null +++ b/location.iml @@ -0,0 +1,81 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/mix.exs b/mix.exs index 375a2da..c2ef6e9 100644 --- a/mix.exs +++ b/mix.exs @@ -40,7 +40,7 @@ defmodule Location.MixProject do defp extra_applications(_env), do: [] # Specifies which paths to compile per environment. - defp elixirc_paths(env) when env in [:dev, :test], do: ["lib", "mix_tasks"] + defp elixirc_paths(:test), do: ["lib"] defp elixirc_paths(_env), do: ["lib"] # Run "mix help deps" to learn about dependencies. @@ -48,10 +48,10 @@ defmodule Location.MixProject do [ {:jason, "~> 1.3"}, {:nimble_csv, "~> 1.2"}, - {:floki, "~> 0.35.2", only: [:dev, :test]}, + {:floki, "~> 0.35.2"}, {:tesla, "~> 1.8"}, {:hackney, "~> 1.20"}, - {:flow, "~> 1.2", only: [:dev, :test]}, + {:flow, "~> 1.2"}, {:unzip, "0.11.0"} ] end diff --git a/priv/version b/priv/version deleted file mode 100644 index f69ed48..0000000 --- a/priv/version +++ /dev/null @@ -1 +0,0 @@ -2024-07-09 \ No newline at end of file From df8a0af92e60771b3ed4cf75eb5a5deda6757901 Mon Sep 17 00:00:00 2001 From: mithereal Date: Wed, 26 Feb 2025 01:54:49 -0700 Subject: [PATCH 10/11] fix postal code downloader/parser --- lib/location.ex | 2 + lib/location/postalcode.ex | 170 +++++++++++++++++++---- lib/mix/tasks/update_geoname_data.ex | 1 + lib/mix/tasks/update_postal_code_data.ex | 16 ++- location.iml | 1 + mix.exs | 3 +- mix.lock | 1 + priv/postal_codes.csv | 0 8 files changed, 158 insertions(+), 36 deletions(-) delete mode 100644 priv/postal_codes.csv diff --git a/lib/location.ex b/lib/location.ex index 13e6054..68a2763 100644 --- a/lib/location.ex +++ b/lib/location.ex @@ -1,3 +1,4 @@ +NimbleCSV.define(PostCodeCSV, separator: ",", escape: "\~") NimbleCSV.define(LocationCSV, separator: "\t", escape: "\~") defmodule Location do @@ -10,6 +11,7 @@ defmodule Location do defdelegate get_city(city_name, country_code), to: Location.City defdelegate get_postal_code(code), to: Location.PostalCode defdelegate get_postal_codes(country_code, state_code, city_name), to: Location.PostalCode + defdelegate get_postal_codes(), to: Location.PostalCode def load_all() do Logger.debug("Loading location databases...") diff --git a/lib/location/postalcode.ex b/lib/location/postalcode.ex index b65cc4e..b0fb81a 100644 --- a/lib/location/postalcode.ex +++ b/lib/location/postalcode.ex @@ -23,40 +23,26 @@ defmodule Location.PostalCode do {:decentralized_counters, false} ]) + @ets_table_by_id = + :ets.new(@ets_table_by_id, [ + :set, + :named_table, + :public, + :compressed, + {:write_concurrency, true}, + {:read_concurrency, true}, + {:decentralized_counters, false} + ]) + source_file() |> File.stream!() |> Stream.chunk_every(15_000) |> Task.async_stream( fn chunk -> chunk - |> LocationCSV.parse_stream() - |> Stream.each(fn [ - country_code, - postal_code, - city_name, - _state_name, - state_code, - _municipality, - _municipality_code, - _admin_name3, - _admin_code3, - latitude, - longitude, - _accuracy - ] -> - country_code = String.trim(country_code) - - true = - :ets.insert( - @ets_table_by_lookup, - {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} - ) - - true = - :ets.insert( - @ets_table_by_id, - {postal_code, {country_code, state_code, city_name, latitude, longitude}} - ) + |> PostCodeCSV.parse_stream() + |> Stream.each(fn data -> + __MODULE__.parse(data) end) |> Stream.run() end, @@ -65,6 +51,125 @@ defmodule Location.PostalCode do |> Stream.run() end + def parse(data) do + case data do + [ + country_code, + postal_code, + city_name, + _state_name, + state_code, + _municipality, + _municipality_code, + _admin_name3, + _admin_code3, + latitude, + longitude, + _accuracy, + _, + _ + ] -> + country_code = String.trim(country_code) + + true = + :ets.insert( + @ets_table_by_lookup, + {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} + ) + + true = + :ets.insert( + @ets_table_by_id, + {postal_code, {country_code, state_code, city_name, latitude, longitude}} + ) + + [ + country_code, + postal_code, + city_name, + _state_name, + state_code, + _municipality, + _municipality_code, + _admin_name3, + _admin_code3, + latitude, + longitude, + _accuracy, + _ + ] -> + country_code = String.trim(country_code) + + true = + :ets.insert( + @ets_table_by_lookup, + {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} + ) + + true = + :ets.insert( + @ets_table_by_id, + {postal_code, {country_code, state_code, city_name, latitude, longitude}} + ) + + [ + country_code, + postal_code, + city_name, + _state_name, + state_code, + _municipality, + _municipality_code, + _admin_name3, + _admin_code3, + latitude, + longitude, + _accuracy + ] -> + country_code = String.trim(country_code) + + true = + :ets.insert( + @ets_table_by_lookup, + {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} + ) + + true = + :ets.insert( + @ets_table_by_id, + {postal_code, {country_code, state_code, city_name, latitude, longitude}} + ) + + [ + country_code, + postal_code, + city_name, + _state_name, + state_code, + _municipality, + _municipality_code, + _admin_name3, + _admin_code3, + latitude, + longitude + ] -> + true = + :ets.insert( + @ets_table_by_lookup, + {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} + ) + + true = + :ets.insert( + @ets_table_by_id, + {postal_code, {country_code, state_code, city_name, latitude, longitude}} + ) + + _data -> + :ok + end + end + @doc """ Finds postal_code information by postal code. """ @@ -99,6 +204,15 @@ defmodule Location.PostalCode do end end + @spec get_postal_codes() :: %__MODULE__{} | nil + def get_postal_codes() do + :ets.tab2list(@ets_table_by_lookup) + |> Enum.map(fn x -> + {{country_code, state_code, city_name}, {postal_code, latitude, longitude}} = x + to_struct(postal_code, country_code, state_code, city_name, latitude, longitude) + end) + end + defp source_file() do default = Application.app_dir(:location, "/priv/postal_codes.csv") Application.get_env(:location, :postal_codes_source_file, default) diff --git a/lib/mix/tasks/update_geoname_data.ex b/lib/mix/tasks/update_geoname_data.ex index 9b574d1..9a20fbb 100644 --- a/lib/mix/tasks/update_geoname_data.ex +++ b/lib/mix/tasks/update_geoname_data.ex @@ -11,6 +11,7 @@ defmodule Mix.Tasks.Location.UpdateGeonameData do System.cmd("wget", [@allcountries_src, "-O", "/tmp/allCountries.zip"]) zip_file = Unzip.LocalFile.open("/tmp/allCountries.zip") {:ok, unzip} = Unzip.new(zip_file) + Unzip.file_stream!(unzip, "allCountries.txt") |> Stream.into(File.stream!("/tmp/allCountries.txt")) |> Stream.run() diff --git a/lib/mix/tasks/update_postal_code_data.ex b/lib/mix/tasks/update_postal_code_data.ex index eba6005..bc3cb64 100644 --- a/lib/mix/tasks/update_postal_code_data.ex +++ b/lib/mix/tasks/update_postal_code_data.ex @@ -2,7 +2,11 @@ defmodule Mix.Tasks.Location.UpdatePostalCodeData do use Mix.Task @shortdoc "Updates the postal code data from source" - @destination_filename Application.compile_env(:location, :postal_codes_source_file, "priv/postal_codes.csv") + @destination_filename Application.compile_env( + :location, + :postal_codes_source_file, + "priv/postal_codes.csv" + ) @doc """ The data source clocks in at 16mb. Expect this to take a while. @@ -46,6 +50,7 @@ defmodule Mix.Tasks.Location.UpdatePostalCodeData do zip_file = Unzip.LocalFile.open("/tmp/#{name}.zip") {:ok, unzip} = Unzip.new(zip_file) + Unzip.file_stream!(unzip, "#{name}.txt") |> Stream.into(File.stream!("/tmp/#{name}.txt")) |> Stream.run() @@ -61,7 +66,7 @@ defmodule Mix.Tasks.Location.UpdatePostalCodeData do filename |> File.stream!(read_ahead: 100_000) |> Flow.from_enumerable() - |> Flow.map(&String.split(&1, tab)) + |> Flow.map(&(String.trim(&1) |> String.split(tab))) |> Flow.partition() |> Enum.into([]) @@ -69,10 +74,7 @@ defmodule Mix.Tasks.Location.UpdatePostalCodeData do Location.Scraper.write_date_to_version() - case append do - false -> File.write!(@destination_filename, Enum.join(result, "\n")) - true -> File.write!(@destination_filename, Enum.join(result, "\n"), :append) - end - + file = File.open!(@destination_filename, [:write, :utf8]) + result |> CSV.encode() |> Enum.each(&IO.write(file, &1)) end end diff --git a/location.iml b/location.iml index 1c30b2f..89e3c09 100644 --- a/location.iml +++ b/location.iml @@ -77,5 +77,6 @@ + \ No newline at end of file diff --git a/mix.exs b/mix.exs index c2ef6e9..00ed18c 100644 --- a/mix.exs +++ b/mix.exs @@ -52,7 +52,8 @@ defmodule Location.MixProject do {:tesla, "~> 1.8"}, {:hackney, "~> 1.20"}, {:flow, "~> 1.2"}, - {:unzip, "0.11.0"} + {:unzip, "0.11.0"}, + {:csv, "~> 3.2"} ] end end diff --git a/mix.lock b/mix.lock index f9f3f83..0c0c65e 100644 --- a/mix.lock +++ b/mix.lock @@ -1,5 +1,6 @@ %{ "certifi": {:hex, :certifi, "2.14.0", "ed3bef654e69cde5e6c022df8070a579a79e8ba2368a00acf3d75b82d9aceeed", [:rebar3], [], "hexpm", "ea59d87ef89da429b8e905264fdec3419f84f2215bb3d81e07a18aac919026c3"}, + "csv": {:hex, :csv, "3.2.2", "452f96414b39a176b7c390af6d8b78f15130dc6167fe3b836729131f515d843e", [:mix], [], "hexpm", "cbf256ff74a3fa01d9ec420d07b19c90d410ed9fe5b6d6e1bc7662edf35bc574"}, "floki": {:hex, :floki, "0.35.4", "cc947b446024732c07274ac656600c5c4dc014caa1f8fb2dfff93d275b83890d", [:mix], [], "hexpm", "27fa185d3469bd8fc5947ef0f8d5c4e47f0af02eb6b070b63c868f69e3af0204"}, "flow": {:hex, :flow, "1.2.4", "1dd58918287eb286656008777cb32714b5123d3855956f29aa141ebae456922d", [:mix], [{:gen_stage, "~> 1.0", [hex: :gen_stage, repo: "hexpm", optional: false]}], "hexpm", "874adde96368e71870f3510b91e35bc31652291858c86c0e75359cbdd35eb211"}, "gen_stage": {:hex, :gen_stage, "1.2.1", "19d8b5e9a5996d813b8245338a28246307fd8b9c99d1237de199d21efc4c76a1", [:mix], [], "hexpm", "83e8be657fa05b992ffa6ac1e3af6d57aa50aace8f691fcf696ff02f8335b001"}, diff --git a/priv/postal_codes.csv b/priv/postal_codes.csv deleted file mode 100644 index e69de29..0000000 From f398d3096e34f7c3ead9879bfe73416caaa8253b Mon Sep 17 00:00:00 2001 From: mithereal Date: Sat, 15 Mar 2025 01:33:42 -0700 Subject: [PATCH 11/11] add unload function --- lib/location.ex | 14 ++++++++++++++ lib/location/city.ex | 5 +++++ lib/location/country.ex | 4 ++++ lib/location/postalcode.ex | 5 +++++ lib/location/subdivision.ex | 4 ++++ 5 files changed, 32 insertions(+) diff --git a/lib/location.ex b/lib/location.ex index 68a2763..a3fc997 100644 --- a/lib/location.ex +++ b/lib/location.ex @@ -13,6 +13,13 @@ defmodule Location do defdelegate get_postal_codes(country_code, state_code, city_name), to: Location.PostalCode defdelegate get_postal_codes(), to: Location.PostalCode + def unload_all()do + :ok = unload(Location.Country) + :ok = unload(Location.Subdivision) + :ok = unload(Location.City) + :ok = unload(Location.PostalCode) + end + def load_all() do Logger.debug("Loading location databases...") @@ -34,6 +41,13 @@ defmodule Location do :ok end + def unload(module) do + module.unload() + + Logger.debug("Unloading location database #{inspect(module)}") + :ok + end + def version() do version_file = Application.app_dir(:location, "priv/version") diff --git a/lib/location/city.ex b/lib/location/city.ex index 4a5f275..adf39d0 100644 --- a/lib/location/city.ex +++ b/lib/location/city.ex @@ -4,6 +4,11 @@ defmodule Location.City do defstruct [:id, :name, :country_code] + def unload()do + :ets.delete(@ets_table_by_id) + :ets.delete(@ets_table_by_label) + end + def load() do @ets_table_by_id = :ets.new(@ets_table_by_id, [ diff --git a/lib/location/country.ex b/lib/location/country.ex index aa32730..ff24218 100644 --- a/lib/location/country.ex +++ b/lib/location/country.ex @@ -3,6 +3,10 @@ defmodule Location.Country do defstruct [:alpha_2, :alpha_3, :name, :flag] + def unload()do + :ets.delete(@ets_table) + end + def load() do ets = :ets.new(@ets_table, [:named_table]) diff --git a/lib/location/postalcode.ex b/lib/location/postalcode.ex index b0fb81a..0b4c8c2 100644 --- a/lib/location/postalcode.ex +++ b/lib/location/postalcode.ex @@ -11,6 +11,11 @@ defmodule Location.PostalCode do :longitude ] + def unload()do + :ets.delete(@ets_table_by_id) + :ets.delete(@ets_table_by_lookup) + end + def load() do @ets_table_by_lookup = :ets.new(@ets_table_by_lookup, [ diff --git a/lib/location/subdivision.ex b/lib/location/subdivision.ex index ce4ace8..ab8dabb 100644 --- a/lib/location/subdivision.ex +++ b/lib/location/subdivision.ex @@ -3,6 +3,10 @@ defmodule Location.Subdivision do defstruct [:code, :name, :type, :country_code] + def unload()do + :ets.delete(@ets_table) + end + def load() do ets = :ets.new(@ets_table, [:named_table])