From b8cc25326d68b394e0fa64807ffbbc9746b79249 Mon Sep 17 00:00:00 2001 From: Mike Williams Date: Wed, 22 Oct 2025 13:50:53 -0400 Subject: [PATCH 1/2] fix: automatic groupings for enrollment/exposure queries --- src/mozanalysis/experiment.py | 49 ++++++++--------------------------- 1 file changed, 11 insertions(+), 38 deletions(-) diff --git a/src/mozanalysis/experiment.py b/src/mozanalysis/experiment.py index 2e3b1c7f..72e9a93d 100644 --- a/src/mozanalysis/experiment.py +++ b/src/mozanalysis/experiment.py @@ -858,7 +858,7 @@ def _build_enrollments_query_normandy( BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' AND e.event_string_value = '{self.experiment_slug}' AND e.sample_id < {sample_size} - GROUP BY e.{self.analysis_unit.value}, branch + GROUP BY ALL """ # noqa:E501 def _build_enrollments_query_fenix_baseline( @@ -895,7 +895,7 @@ def _build_enrollments_query_fenix_baseline( '{experiment_slug}' ).branch IS NOT NULL AND b.sample_id < {sample_size} - GROUP BY b.client_info.client_id, branch + GROUP BY ALL HAVING enrollment_date >= '{first_enrollment_date}' """.format( experiment_slug=self.experiment_slug, @@ -905,36 +905,6 @@ def _build_enrollments_query_fenix_baseline( sample_size=sample_size, ) - def _build_enrollments_query_glean_event( - self, time_limits: TimeLimits, dataset: str, sample_size: int = 100 - ) -> str: - """Deprecated; see _build_enrollments_query_glean_events_stream below - - Return SQL to query enrollments for a Glean experiment from the - events table for the application or dataset. - """ - - return f""" - SELECT events.client_info.client_id AS analysis_id, - mozfun.map.get_key( - e.extra, - 'branch' - ) AS branch, - DATE(MIN(events.submission_timestamp)) AS enrollment_date, - COUNT(events.submission_timestamp) AS num_enrollment_events - FROM `moz-fx-data-shared-prod.{self.app_id or dataset}.events` events, - UNNEST(events.events) AS e - WHERE - events.client_info.client_id IS NOT NULL AND - DATE(events.submission_timestamp) - BETWEEN '{time_limits.first_enrollment_date}' AND '{time_limits.last_enrollment_date}' - AND e.category = "nimbus_events" - AND mozfun.map.get_key(e.extra, "experiment") = '{self.experiment_slug}' - AND e.name = 'enrollment' - AND sample_id < {sample_size} - GROUP BY events.client_info.client_id, branch - """ # noqa:E501 - def _build_enrollments_query_glean_events_stream( self, time_limits: TimeLimits, @@ -961,7 +931,7 @@ def _build_enrollments_query_glean_events_stream( AND JSON_VALUE(event_extra, "$.experiment") = "{self.experiment_slug}" AND event_name = "enrollment" AND sample_id < {sample_size} - GROUP BY client_id, branch + GROUP BY ALL """ # noqa:E501 def _build_enrollments_query_glean_events_stream_enrollment_status( @@ -972,6 +942,9 @@ def _build_enrollments_query_glean_events_stream_enrollment_status( enrollments from the enrollment_status event. """ + logger.warning("enrollment_status is not ready for production use.") + # TODO: dynamic analysis_id + return f""" SELECT client_id AS analysis_id, @@ -989,7 +962,7 @@ def _build_enrollments_query_glean_events_stream_enrollment_status( AND JSON_VALUE(event_extra, "$.status") = "Enrolled" AND JSON_VALUE(event_extra, "$.reason") = "Qualified" AND sample_id < {sample_size} - GROUP BY client_id, branch + GROUP BY ALL """ # noqa:E501 def _build_enrollments_query_cirrus( @@ -1024,7 +997,7 @@ def _build_enrollments_query_cirrus( AND mozfun.map.get_key(e.extra, "experiment") = '{self.experiment_slug}' AND e.name = 'enrollment' AND client_info.app_channel = 'production' - GROUP BY mozfun.map.get_key(e.extra, "nimbus_user_id"), branch + GROUP BY ALL """ # noqa:E501 def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: @@ -1053,7 +1026,7 @@ def _build_exposure_query_normandy(self, time_limits: TimeLimits) -> str: ON re.analysis_id = e.analysis_id AND re.branch = e.branch AND e.submission_date >= re.enrollment_date - GROUP BY e.analysis_id, e.branch + GROUP BY ALL """ # noqa: E501 def _build_exposure_query_glean_event( @@ -1091,7 +1064,7 @@ def _build_exposure_query_glean_event( ON re.analysis_id = exposures.analysis_id AND re.branch = exposures.branch AND exposures.submission_date >= re.enrollment_date - GROUP BY analysis_id, branch + GROUP BY ALL """ # noqa: E501 def _build_exposure_query_glean_events_stream( @@ -1126,7 +1099,7 @@ def _build_exposure_query_glean_events_stream( ON re.analysis_id = exposures.analysis_id AND re.branch = exposures.branch AND exposures.submission_date >= re.enrollment_date - GROUP BY analysis_id, branch + GROUP BY ALL """ # noqa: E501 def _build_metrics_query_bits( From 03eb325bbf3f1550d66d5a9680d4e4ba458dc73a Mon Sep 17 00:00:00 2001 From: Mike Williams Date: Wed, 22 Oct 2025 14:01:00 -0400 Subject: [PATCH 2/2] fix tests & ci run condition --- .github/workflows/build.yml | 1 + tests/test_experiment.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f8ce5e18..ef49f882 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,6 +2,7 @@ name: Build, Test, and Deploy on: push: branches: + - main pull_request: jobs: diff --git a/tests/test_experiment.py b/tests/test_experiment.py index ecff02d6..5f47d5b6 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -304,7 +304,7 @@ def test_query_not_detectably_malformed(analysis_unit: AnalysisUnit): sql_lint(enrollments_sql) assert "sample_id < None" not in enrollments_sql - assert enrollments_sql.count(analysis_unit.value) == 3 + assert enrollments_sql.count(analysis_unit.value) == 2 metrics_sql = exp.build_metrics_query( metric_list=[], @@ -345,7 +345,7 @@ def test_megaquery_not_detectably_malformed( sql_lint(enrollments_sql) - assert enrollments_sql.count(analysis_unit.value) == 3 + assert enrollments_sql.count(analysis_unit.value) == 2 metrics_sql = exp.build_metrics_query( metric_list=desktop_metrics, @@ -1069,7 +1069,7 @@ def test_enrollments_query_analysis_unit(analysis_unit): BETWEEN '2019-01-01' AND '2019-01-08' AND e.event_string_value = 'slug' AND e.sample_id < 100 -GROUP BY e.{analysis_unit.value}, branch +GROUP BY ALL ), segmented_enrollments AS ( SELECT @@ -1102,7 +1102,7 @@ def test_enrollments_query_analysis_unit(analysis_unit): ON re.analysis_id = e.analysis_id AND re.branch = e.branch AND e.submission_date >= re.enrollment_date -GROUP BY e.analysis_id, e.branch +GROUP BY ALL ) SELECT