From bbdd582d7fb127153ebbddcaa12b78b53db636ab Mon Sep 17 00:00:00 2001 From: christophercaradonna Date: Thu, 25 Sep 2025 14:06:28 -0600 Subject: [PATCH 01/16] Setting up testing runs --- .../compare_upgrades-test_timeseries_plots.py | 113 ++++++++++++++++++ .../comstock_measure_comparison.py | 108 ++++++++--------- .../comstockpostproc/plotting_mixin.py | 15 +++ 3 files changed, 182 insertions(+), 54 deletions(-) create mode 100644 postprocessing/compare_upgrades-test_timeseries_plots.py diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py new file mode 100644 index 000000000..afed1de15 --- /dev/null +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import logging + +import comstockpostproc as cspp + +logging.basicConfig(level='INFO') # Use DEBUG, INFO, or WARNING +logger = logging.getLogger(__name__) + +def main(): + # ComStock run + comstock = cspp.ComStock( + s3_base_dir='com-sdr/production_runs/2025_r3_2012/test', # If run not on S3, download results_up**.parquet manually + comstock_run_name='sdr_2025_r3_2012_1of5_test100_df', # Name of the run on S3 + comstock_run_version='sdr_2025_r3_2012_1of5_test100_df', # Use whatever you want to see in plot and folder names + comstock_year=2018, # Typically don't change this + athena_table_name=None, # Typically don't change this + truth_data_version='v01', # Typically don't change this + buildstock_csv_name='buildstock.csv', # Download buildstock.csv manually + acceptable_failure_percentage=0.05, # Can increase this when testing and high failure are OK + drop_failed_runs=True, # False if you want to evaluate which runs failed in raw output data + color_hex='#0072B2', # Color used to represent this run in plots + skip_missing_columns=True, # False if you want to ensure you have all data specified for export + reload_from_cache=True, # True if CSV already made and want faster reload times + include_upgrades=True, # False if not looking at upgrades + upgrade_ids_to_skip=[], # Use [1, 3] etc. to exclude certain upgrades + make_timeseries_plots=False, + states={ + #'MN': 'Minnesota', # specify state to use for timeseries plots in dictionary format. State ID must correspond correctly. + 'MA':'Massachusetts', + #'OR': 'Oregon', + #'LA': 'Louisiana', + #'AZ': 'Arizona', + #'TN': 'Tennessee' + }, + upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 + #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' + ) + + # Stock Estimation for Apportionment: + stock_estimate = cspp.Apportion( + stock_estimation_version='2025R3', # Only updated when a new stock estimate is published + truth_data_version='v01', # Typically don't change this + reload_from_cache=True, # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. 
+ #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' + ) + + # Scale ComStock runs to the 'truth data' from StockE V3 estimates using bucket-based apportionment + base_sim_outs = comstock.get_sim_outs_for_upgrade(0) + comstock.create_allocated_weights(stock_estimate, base_sim_outs, reload_from_cache=True) + + # CBECS + cbecs = cspp.CBECS( + cbecs_year=2018, # 2012 and 2018 currently available + truth_data_version='v01', # Typically don't change this + color_hex='#009E73', # Color used to represent CBECS in plots + reload_from_csv=True # True if CSV already made and want faster reload times + ) + + # Scale ComStock to CBECS 2018 AND remove non-ComStock buildings from CBECS + base_sim_outs = comstock.get_sim_outs_for_upgrade(0) + alloc_wts = comstock.get_allocated_weights() + comstock.create_allocated_weights_scaled_to_cbecs(cbecs, base_sim_outs, alloc_wts, remove_non_comstock_bldg_types_from_cbecs=True) + + # Add utility bills onto allocated weights + for upgrade_id in comstock.upgrade_ids_to_process: + # up_sim_outs = comstock.get_sim_outs_for_upgrade(upgrade_id) + # up_alloc_wts = comstock.get_allocated_weights_scaled_to_cbecs_for_upgrade(upgrade_id) + comstock.create_allocated_weights_plus_util_bills_for_upgrade(upgrade_id) + + # Export metadata files + geo_exports = [ + #{'geo_top_dir': 'by_state_and_county', + # 'partition_cols': { + # comstock.STATE_ABBRV: 'state', + # comstock.COUNTY_ID: 'county', + # }, + # 'aggregation_levels': ['in.nhgis_tract_gisjoin'], # , comstock.COUNTY_ID], # Full tract resolution (agg=in.nhgis_tract_gisjoin) + # 'data_types': ['full', 'basic'], + # 'file_types': ['csv', 'parquet'], + #}, + {'geo_top_dir': 'national_by_state', + 'partition_cols': {}, + 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], + 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. 
+ 'file_types': ['csv'], # other options:'parquet' + } + ] + + #for geo_export in geo_exports: + # for upgrade_id in comstock.upgrade_ids_to_process: + # # if upgrade_id == 0: + # # continue + # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) + + ## Compare to CBECS + #comp = cspp.ComStockToCBECSComparison(cbecs_list=[cbecs], + # comstock_list=[comstock], + # upgrade_id='All', + # make_comparison_plots=True, + # make_hvac_plots=False) + #comp.export_to_csv_wide() # May comment this out after run once + + # Create measure run comparisons; only use if run has measures + comparison = cspp.ComStockMeasureComparison(comstock, states=comstock.states, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) + + # Export dictionaries corresponding to the exported columns + #comstock.export_data_and_enumeration_dictionary() + +# Code to execute the script +if __name__=="__main__": + main() diff --git a/postprocessing/comstockpostproc/comstock_measure_comparison.py b/postprocessing/comstockpostproc/comstock_measure_comparison.py index 0e942a5a1..f3acff9a9 100644 --- a/postprocessing/comstockpostproc/comstock_measure_comparison.py +++ b/postprocessing/comstockpostproc/comstock_measure_comparison.py @@ -129,60 +129,60 @@ def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make 'output_dir': output_dir } - LazyFramePlotter.plot_with_lazy( - plot_method=self.plot_energy_by_enduse_and_fuel_type, - lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + self.lazyframe_plotter.WTD_COLUMNS_ANN_PV + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) - LazyFramePlotter.plot_with_lazy( - plot_method=self.plot_emissions_by_fuel_type, - lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_GHG_COLUMNS))(**BASIC_PARAMS) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_utility_bills_by_fuel_type, lazy_frame=lazy_frame.clone(), columns=( - self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_UTILITY_COLUMNS))(**BASIC_PARAMS) - LazyFramePlotter.plot_with_lazy( - plot_method=self.plot_floor_area_and_energy_totals, - lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) - LazyFramePlotter.plot_with_lazy( - plot_method=self.plot_floor_area_and_energy_totals_by_building_type, - lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) - LazyFramePlotter.plot_with_lazy( - plot_method=self.plot_end_use_totals_by_building_type, - lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + [self.BLDG_TYPE, self.CEN_DIV]))(**BASIC_PARAMS) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_histograms_by_building_type, - lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.BLDG_TYPE]))(**BASIC_PARAMS) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_boxplots_by_building_type, - lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.CEN_DIV, self.BLDG_TYPE]))(**BASIC_PARAMS) - 
LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_enduse_and_fuel, lazy_frame=lazy_frame.clone( - ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_ENDUSE_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( - ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_hvac_system_type, lazy_frame=lazy_frame.clone( - ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_fuel, lazy_frame=lazy_frame.clone( - ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone( - ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( - ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_hvac_system, lazy_frame=lazy_frame.clone( - ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) - - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_timing, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) - - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_max_use, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) - - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_min_use, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) - - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_unmet_hours, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.UNMET_HOURS_COLS))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy( + # plot_method=self.plot_energy_by_enduse_and_fuel_type, + # lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + self.lazyframe_plotter.WTD_COLUMNS_ANN_PV + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy( + # 
plot_method=self.plot_emissions_by_fuel_type, + # lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_GHG_COLUMNS))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_utility_bills_by_fuel_type, lazy_frame=lazy_frame.clone(), columns=( + # self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_UTILITY_COLUMNS))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy( + # plot_method=self.plot_floor_area_and_energy_totals, + # lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy( + # plot_method=self.plot_floor_area_and_energy_totals_by_building_type, + # lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy( + # plot_method=self.plot_end_use_totals_by_building_type, + # lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + [self.BLDG_TYPE, self.CEN_DIV]))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_histograms_by_building_type, + # lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.BLDG_TYPE]))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_boxplots_by_building_type, + # lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.CEN_DIV, self.BLDG_TYPE]))(**BASIC_PARAMS) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_enduse_and_fuel, lazy_frame=lazy_frame.clone( + #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_ENDUSE_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( + #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_hvac_system_type, lazy_frame=lazy_frame.clone( + #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_fuel, lazy_frame=lazy_frame.clone( + #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone( + #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) + 
#LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( + #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_hvac_system, lazy_frame=lazy_frame.clone( + #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) + + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_timing, lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) + + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_max_use, lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) + + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_min_use, lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) + + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_unmet_hours, lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.UNMET_HOURS_COLS))(**BASIC_PARAMS) if make_timeseries_plots: TIMESERIES_PARAMS = {'comstock_run_name': self.comstock_run_name, 'states': states, 'color_map': color_map, diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 766e60cce..2cf6b5d37 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -3010,6 +3010,18 @@ def plot_load_duration_curve(self, df, region, building_type, color_map, output_ plt.savefig(output_path, bbox_inches='tight') + # create weights table on S3 based on allocation outcomes + def clone_weights_table_on_s3(self, df, state, bldg_type_col, weight_col): + """ + This method clones the weights table created during allocation on S3 so it can be reused for weighting timeseries profiles on S3. + The weights table is for the full baseline run at full resolution. 
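+        Note: currently a stub. It only loads the baseline (upgrade 0) weights via
+        get_allocated_weights_scaled_to_cbecs_for_upgrade(0) and does not yet copy
+        anything to S3.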
+ """ + self.get_allocated_weights_scaled_to_cbecs_for_upgrade(0) + + + + + # get weighted load profiles def wgt_by_btype(self, df, run_data, dict_wgts, upgrade_num, state, upgrade_name): """ @@ -3538,7 +3550,10 @@ def map_to_dow(dow): dfs_merged=None if not os.path.exists(file_path): + + ######### This is where we get the weighted data dfs_base_combined, dfs_upgrade_combined = self.wgt_by_btype(df, run_data, dict_wgts, upgrade_num, state, upgrade_name) + ######### # merge into single dataframe dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) From cf4fd58678339b1f423cebd0c6736d15395959a2 Mon Sep 17 00:00:00 2001 From: christophercaradonna Date: Wed, 29 Oct 2025 11:15:48 -0600 Subject: [PATCH 02/16] progress --- .../compare_upgrades-test_timeseries_plots.py | 612 +++++++++++++++++- postprocessing/comstockpostproc/comstock.py | 25 +- .../comstock_measure_comparison.py | 29 +- .../comstock_query_builder.py | 409 ++++++++++++ .../comstockpostproc/plotting_mixin.py | 576 ++++++++++------- .../comstockpostproc/s3_utilities_mixin.py | 2 +- postprocessing/setup.py | 4 +- 7 files changed, 1393 insertions(+), 264 deletions(-) create mode 100644 postprocessing/comstockpostproc/comstock_query_builder.py diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index afed1de15..28ed481ed 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -2,18 +2,581 @@ # -*- coding: utf-8 -*- import logging - +import boto3 +import time +from fsspec.core import url_to_fs +from sqlalchemy.engine import create_engine +import sqlalchemy as sa import comstockpostproc as cspp +from sqlalchemy_views import CreateView +import re + + +import sys, inspect, importlib, site, platform, shutil, subprocess, os + +mods = ["pyathena","s3fs","fsspec","botocore","aiobotocore","boto3","pandas"] +for m in mods: + try: + mod = importlib.import_module(m) + print(f"{m:12} {getattr(mod,'__version__','?'):>10} @ {inspect.getfile(mod)}") + except Exception as e: + print(f"{m:12} NOT IMPORTABLE: {e}") + +# Python version details +print("python exe:", sys.executable) +print("python implementation:", platform.python_implementation()) +print("python version:", platform.python_version()) +print("sys.version:", sys.version.replace("\n", " ")) +print("sys.version_info:", sys.version_info) logging.basicConfig(level='INFO') # Use DEBUG, INFO, or WARNING logger = logging.getLogger(__name__) +def create_sightglass_tables( + s3_location: str, + dataset_name: str, + database_name: str = "vizstock", + glue_service_role: str = "vizstock-glue-service-role", + ): + fs, fs_path = url_to_fs(s3_location) + + glue = boto3.client("glue", region_name="us-west-2") + + crawler_name_base = f"vizstock_{dataset_name}" + tbl_prefix_base = f"{dataset_name}" + + crawler_params = { + "Role": glue_service_role, + "DatabaseName": database_name, + "Targets": {}, + "SchemaChangePolicy": { + "UpdateBehavior": "UPDATE_IN_DATABASE", + "DeleteBehavior": "DELETE_FROM_DATABASE", + } + } + + # Crawl each metadata target separately to create a + # unique table name for each metadata folder + logger.info(f"Creating crawlers for metadata") + md_agg_paths = fs.ls(f"{s3_location}") #{fs_path}/metadata_and_annual_results_aggregates/ + md_paths = fs.ls(f"{fs_path}/metadata_and_annual_results/") + + for md_path in md_agg_paths + md_paths: + md_geog = md_path.split('/')[-1] + if '_aggregates' in md_path: + 
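+            # e.g. an md_path ending in 'national_by_state' (illustrative) yields
+            # crawler 'vizstock_<dataset_name>_md_agg_national_by_state' and
+            # table prefix '<dataset_name>_md_agg_national_by_state_'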
crawler_name = f'{crawler_name_base}_md_agg_{md_geog}' + tbl_prefix = f'{tbl_prefix_base}_md_agg_{md_geog}_' + else: + crawler_name = f'{crawler_name_base}_md_{md_geog}' + tbl_prefix = f'{tbl_prefix_base}_md_{md_geog}_' + + crawler_params["Name"] = crawler_name + crawler_params["TablePrefix"] = tbl_prefix + crawler_params["Targets"]["S3Targets"] = [{ + "Path": f"s3://{md_path}/full/parquet", + "SampleSize": 1 + }] + + try: + crawler_info = glue.get_crawler(Name=crawler_name) + except glue.exceptions.EntityNotFoundException as ex: + logger.info(f"Creating Crawler {crawler_name}") + glue.create_crawler(**crawler_params) + else: + logger.info(f"Updating Crawler {crawler_name}") + glue.update_crawler(**crawler_params) + + logger.info(f"Running Crawler {crawler_name} on: {md_path}") + + expected_table_name = f"{tbl_prefix}{md_geog}" + logger.info(f"Expected Athena table: {expected_table_name} in database {database_name}") + + glue.start_crawler(Name=crawler_name) + time.sleep(10) + + crawler_info = glue.get_crawler(Name=crawler_name) + while crawler_info["Crawler"]["State"] != "READY": + time.sleep(10) + crawler_info = glue.get_crawler(Name=crawler_name) + logger.info(f"Crawler state: {crawler_info['Crawler']['State']}") + glue.delete_crawler(Name=crawler_name) + logger.info(f"Deleting Crawler {crawler_name}") + + ## Crawl the timeseries targets with one crawler + #logger.info(f"Creating Crawler for timeseries data") + #crawler_name = f'{crawler_name_base}_ts' + #tbl_prefix = f'{tbl_prefix_base}_ts_' + + #crawler_params["Name"] = crawler_name + #crawler_params["TablePrefix"] = tbl_prefix + #crawler_params["Targets"]["S3Targets"] = [] + #for ts_path in fs.ls(f"{fs_path}/timeseries_individual_buildings/"): + # crawler_params["Targets"]["S3Targets"].append( + # {"Path": f"s3://{ts_path}", "SampleSize": 1} + # ) + + #breakpoint() + #try: + # crawler_info = glue.get_crawler(Name=crawler_name) + #except glue.exceptions.EntityNotFoundException as ex: + # logger.info(f"Creating Crawler {crawler_name}") + # glue.create_crawler(**crawler_params) + #else: + # logger.info(f"Updating Crawler {crawler_name}") + # glue.update_crawler(**crawler_params) + + #logger.info(f"Running crawler {crawler_name} on: {ts_path}") + #glue.start_crawler(Name=crawler_name) + #time.sleep(10) + + #crawler_info = glue.get_crawler(Name=crawler_name) + #while crawler_info["Crawler"]["State"] != "READY": + # time.sleep(10) + # crawler_info = glue.get_crawler(Name=crawler_name) + # logger.info(f"Crawler state: {crawler_info['Crawler']['State']}") + #glue.delete_crawler(Name=crawler_name) + #logger.info(f"Deleting Crawler {crawler_name}") + + return dataset_name + +def fix_timeseries_tables(dataset_name: str, database_name: str = "vizstock"): + logger.info("Updating timeseries table schemas") + + glue = boto3.client("glue", region_name="us-west-2") + tbl_prefix = f"{dataset_name}_" + get_tables_kw = {"DatabaseName": database_name, "Expression": f"{tbl_prefix}*"} + tbls_resp = glue.get_tables(**get_tables_kw) + tbl_list = tbls_resp["TableList"] + + while "NextToken" in tbls_resp: + tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_tables_kw) + tbl_list.extend(tbls_resp["TableList"]) + + for tbl in tbl_list: + + # Skip timeseries views that may exist, including ones created by this script + if tbl.get("TableType") == "VIRTUAL_VIEW": + continue + + table_name = tbl['Name'] + if not '_timeseries' in table_name: + continue + + # Check the dtype of the 'upgrade' column. 
+ # Must be 'bigint' to match the metadata schema so joined queries work. + do_tbl_update = False + for part_key in tbl["PartitionKeys"]: + if part_key["Name"] == "upgrade" and part_key["Type"] != "bigint": + do_tbl_update = True + part_key["Type"] = "bigint" + + if not do_tbl_update: + logger.debug(f"Skipping {table_name} because it already has correct partition dtypes") + continue + + # Delete the automatically-created partition index. + # While the index exists, dtypes for the columns in the index cannot be modified. + indexes_resp = glue.get_partition_indexes( + DatabaseName=database_name, + TableName=table_name + ) + for index in indexes_resp['PartitionIndexDescriptorList']: + index_name = index['IndexName'] + index_keys = index['Keys'] + logger.debug(f'Deleting index {index_name} with keys {index_keys} in {table_name}') + glue.delete_partition_index( + DatabaseName=database_name, + TableName=table_name, + IndexName=index_name + ) + + # Wait for index deletion to complete + index_deleted = False + for i in range(0, 60): + indexes_resp = glue.get_partition_indexes( + DatabaseName=database_name, + TableName=table_name + ) + if len(indexes_resp['PartitionIndexDescriptorList']) == 0: + index_deleted = True + break + logger.debug('Waiting 10 seconds to check index deletion status') + time.sleep(10) + if not index_deleted: + raise RuntimeError(f'Did not delete index in 600 seconds, stopping.') + + # Change the dtype of the 'upgrade' partition column to bigint + tbl_input = {} + for k, v in tbl.items(): + if k in ( + "Name", + "Description", + "Owner", + "Retention", + "StorageDescriptor", + "PartitionKeys", + "TableType", + "Parameters", + ): + tbl_input[k] = v + logger.debug(f"Updating dtype of upgrade column in {table_name} to bigint") + glue.update_table(DatabaseName=database_name, TableInput=tbl_input) + + # Recreate the index + key_names = [k['Name'] for k in index_keys] + logger.debug(f"Creating index with columns: {key_names} in {table_name}") + glue.create_partition_index( + # CatalogId='string', + DatabaseName=database_name, + TableName=table_name, + PartitionIndex={ + 'Keys': key_names, + 'IndexName': 'vizstock_partition_index' + } + ) + + # Wait for index creation to complete + index_created = False + for i in range(0, 60): + indexes_resp = glue.get_partition_indexes( + DatabaseName=database_name, + TableName=table_name + ) + for index in indexes_resp['PartitionIndexDescriptorList']: + index_status = index['IndexStatus'] + if index_status == 'ACTIVE': + index_created = True + if index_created: + break + else: + logger.debug('Waiting 10 seconds to check index creation status') + time.sleep(10) + if not index_deleted: + raise RuntimeError(f'Did not create index in 600 seconds, stopping.') + +def column_filter(col): + return not ( + col.name.endswith(".co2e_kg") + or col.name.find("out.electricity.pv") > -1 + or col.name.find("out.electricity.purchased") > -1 + or col.name.find("out.electricity.net") > -1 + or col.name.find(".net.") > -1 + or col.name.find("applicability.") > -1 + or col.name.find("out.qoi.") > -1 + or col.name.find("out.emissions.") > -1 + or col.name.find("out.params.") > -1 + or col.name.find("out.utility_bills.") > -1 + or col.name.find("calc.") > -1 + or col.name.startswith("in.ejscreen") + or col.name.startswith("in.cejst") + or col.name.startswith("in.cluster_id") + or col.name.startswith("in.size_bin_id") + or col.name.startswith("in.sampling_region_id") + or col.name.startswith("out.district_heating.interior_equipment") + or 
col.name.startswith("out.district_heating.cooling") + or col.name.endswith("_applicable") + or col.name.startswith("out.electricity.total.apr") + or col.name.startswith("out.electricity.total.aug") + or col.name.startswith("out.electricity.total.dec") + or col.name.startswith("out.electricity.total.feb") + or col.name.startswith("out.electricity.total.jan") + or col.name.startswith("out.electricity.total.jul") + or col.name.startswith("out.electricity.total.jun") + or col.name.startswith("out.electricity.total.mar") + or col.name.startswith("out.electricity.total.may") + or col.name.startswith("out.electricity.total.nov") + or col.name.startswith("out.electricity.total.oct") + or col.name.startswith("out.electricity.total.sep") + ) + +def create_column_alias(col_name): + + # accept SQLAlchemy Column / ColumnElement or plain str + if not isinstance(col_name, str): + # prefer .name, then .key; fallback to str (last resort) + name = getattr(col_name, "name", None) or getattr(col_name, "key", None) + if not name: + name = str(col_name) + col_name = name + + # 1) name normalizations + normalized = ( + col_name + .replace("gas", "natural_gas") + .replace("fueloil", "fuel_oil") + .replace("districtheating", "district_heating") + .replace("districtcooling", "district_cooling") + ) + + # 2) regex patterns + m1 = re.search( + r"^(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(\w+)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + m2 = re.search( + r"^total_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + m3 = re.search(r"^(total)_(site_energy)_(kwh|therm|mbtu|kbtu)$", normalized) + m4 = re.search( + r"^total_net_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + m5 = re.search( + r"^total_purchased_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + + if not (m1 or m2 or m3 or m4 or m5): + # Not an energy column we care about + return 1.0, normalized + + if m1: + fueltype, enduse, fuel_units = m1.groups() + elif m2: + fueltype, fuel_units = m2.groups() + enduse = "total" + elif m3: + enduse, fueltype, fuel_units = m3.groups() # "total","site_energy",units + # If you prefer "site_energy" to be reported as a pseudo-fuel, keep as-is. + # Otherwise map to a specific convention here. 
+ elif m4: + fueltype, fuel_units = m4.groups() + enduse = "net" + else: # m5 + fueltype, fuel_units = m5.groups() + enduse = "purchased" + + # 3) build alias + col_alias = f"out.{fueltype}.{enduse}.energy_consumption" + + # Created using OpenStudio unit conversion library + energy_unit_conv_to_kwh = { + "mbtu": 293.0710701722222, + "m_btu": 293.0710701722222, + "therm": 29.307107017222222, + "kbtu": 0.2930710701722222, + } + + # 4) conversion factor to kWh + if fuel_units == "kwh": + conv_factor = 1.0 + else: + try: + conv_factor = energy_unit_conv_to_kwh[fuel_units] + except KeyError: + raise ValueError(f"Unhandled energy unit: {fuel_units!r} in column {col_name!r}") + + return conv_factor, col_alias + +def create_views( + dataset_name: str, database_name: str = "vizstock", workgroup: str = "eulp" +): + glue = boto3.client("glue", region_name="us-west-2") + + logger.info(f'Creating views for {dataset_name}') + + # Get a list of metadata tables + get_md_tables_kw = { + "DatabaseName": database_name, + "Expression": f"{dataset_name}_md_.*", + } + tbls_resp = glue.get_tables(**get_md_tables_kw) + tbl_list = tbls_resp["TableList"] + while "NextToken" in tbls_resp: + tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_md_tables_kw) + tbl_list.extend(tbls_resp["TableList"]) + md_tbls = [x["Name"] for x in tbl_list if x["TableType"] != "VIRTUAL_VIEW"] + + # Create a view for each metadata table + for metadata_tblname in md_tbls: + # Connect to the metadata table + engine = create_engine( + f"awsathena+rest://:@athena.us-west-2.amazonaws.com:443/{database_name}?work_group={workgroup}" + ) + meta = sa.MetaData(bind=engine) + metadata_tbl = sa.Table(metadata_tblname, meta, autoload=True) + logger.info(f"Loaded metadata table: {metadata_tblname}") + + # Extract the partition columns from the table name + bys = [] + if 'by' in metadata_tblname: + bys = metadata_tblname.replace(f'{dataset_name}_md_', '') + bys = bys.replace(f'agg_', '').replace(f'by_', '').replace(f'_parquet', '').split('_and_') + logger.debug(f"Partition identifiers = {', '.join(bys)}") + + # Rename the partition column to match hive partition + # Move it up in the column order for easier debugging + # TODO figure out a better approach than hard-coding + aliases = { + 'state': {'col': 'in.state','alias': 'state'}, + 'puma': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'county': {'col': 'in.nhgis_county_gisjoin', 'alias': 'county'}, + 'puma_midwest': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'puma_northeast': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'puma_south': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'puma_west': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'} + } + + # Create a map of column names to aliases for partition columns + #col_to_alias = { aliases[by]['col']: aliases[by]['alias'] for by in bys if aliases[by] } + + # Select columns for the metadata, aliasing partition columns + cols = [] + for col in metadata_tbl.columns: + # Identify partition column and find alias + # col_name = str(col).replace(f'{metadata_tblname}.', '') + # if col_name in col_to_alias: + # # logger.debug(f"Aliasing {col} as {col_to_alias[col_name]}") + # cols.insert( + # 2, metadata_tbl.c[col_name].label(col_to_alias[col_name]) + # ) + # else: + # cols.append(col) + cols.append(col) + + continue + + # Remove everything but the input characteristics and output energy columns from the view + cols = list(filter(column_filter, cols)) + + # Alias the out.foo columns to remove units + # TODO: 
add this to timeseries stuff + cols_aliased = [] + for col in cols: + if col.name.startswith("out.") and col.name.find("..") > -1: + unitless_name = col.name.split('..')[0] + cols_aliased.append(col.label(unitless_name)) + # logger.debug(f'Aliasing {col.name} to {unitless_name}') + elif col.name == 'in.sqft..ft2': + unitless_name = 'in.sqft' # Special requirement for SightGlass + cols_aliased.append(col.label(unitless_name)) + # logger.debug(f'Aliasing {col.name} to {unitless_name}') + else: + cols_aliased.append(col) + cols = cols_aliased + + # Check columns + col_type_errs = 0 + for c in cols: + # Column name length for postgres compatibility + if len(c.name) > 63: + col_type_errs += 1 + logger.error(f'column: `{c.name}` must be < 64 chars for SightGlass postgres storage') + + # in.foo columns may not be bigints or booleans because they cannot be serialized to JSON + # when determining the unique set of filter values for SightGlass + if c.name.startswith('in.') and (str(c.type) == 'BIGINT' or (str(c.type) == 'BOOLEAN')): + col_type_errs += 1 + logger.error(f'in.foo column {c} may not be a BIGINT or BOOLEAN for SightGlass to work') + + # Expected bigint columns in SightGlass + expected_bigints = ['metadata_index', 'upgrade', 'bldg_id'] + if c.name in expected_bigints: + if not str(c.type) == 'BIGINT': + col_type_errs += 1 + logger.error(f'Column {c} must be a BIGINT, but found {c.type}') + + if col_type_errs > 0: + raise RuntimeError(f'{col_type_errs} were found in columns, correct these in metadata before proceeding.') + + # For PUMAs, need to create one view for each census region + if 'puma' in bys: + regions = ("South", "Midwest", "Northeast", "West") + for region in regions: + q = sa.select(cols).where(metadata_tbl.c["in.census_region_name"].in_([region])) + view_name = metadata_tblname.replace('_by_state_and_puma_parquet', '_puma') + view_name = f"{view_name}_{region.lower()}_vu" + db_plus_vu = f'{database_name}_{view_name}' + if len(db_plus_vu) > 63: + raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') + view = sa.Table(view_name, meta) + create_view = CreateView(view, q, or_replace=True) + logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") + engine.execute(create_view) + # logger.debug('Columns in view:') + # for c in cols: + # logger.debug(c) + else: + q = sa.select(cols) + view_name = metadata_tblname.replace('_parquet', '') + view_name = f"{view_name}_vu" + db_plus_vu = f'{database_name}_{view_name}' + if len(db_plus_vu) > 63: + raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') + view = sa.Table(view_name, meta) + create_view = CreateView(view, q, or_replace=True) + logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") + engine.execute(create_view) + # logger.debug('Columns in view:') + # for c in cols: + # logger.debug(c) + + # Get a list of timeseries tables + get_ts_tables_kw = { + "DatabaseName": database_name, + "Expression": f"{dataset_name}_timeseries*", + } + tbls_resp = glue.get_tables(**get_ts_tables_kw) + tbl_list = tbls_resp["TableList"] + while "NextToken" in tbls_resp: + tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_ts_tables_kw) + tbl_list.extend(tbls_resp["TableList"]) + ts_tbls = [x["Name"] for x in tbl_list if x["TableType"] != "VIRTUAL_VIEW"] + + # Create a view for each timeseries table, removing the emissions columns + for ts_tblname in ts_tbls: + engine = create_engine( + 
f"awsathena+rest://:@athena.us-west-2.amazonaws.com:443/{database_name}?work_group={workgroup}" + ) + meta = sa.MetaData(bind=engine) + ts_tbl = sa.Table(ts_tblname, meta, autoload=True) + cols = list(filter(column_filter, ts_tbl.columns)) + cols_aliased = [] + for col in cols: + + # rename + conv_factor, col_alias = create_column_alias(col) + if (col.name == col_alias) & (conv_factor==1): # and conversion factor is 1 + cols_aliased.append(col) + elif (col.name != col_alias) & (conv_factor==1): #name different, conversion factor 1 + cols_aliased.append(col.label(col_alias)) #TODO: return both alias and new units + else: # name and conversion different + cols_aliased.append((col/conv_factor).label(col_alias)) #TODO: return both alias and new units + + if col.name == 'timestamp': #timestamp + # Convert bigint to timestamp type if necessary + if str(col.type) == 'BIGINT': + # Pandas uses nanosecond resolution integer timestamps. + # Presto expects second resolution values in from_unixtime. + # Must divide values by 1e9 to go from nanoseconds to seconds. + cols_aliased.append(sa.func.from_unixtime(col / 1e9).label('timestamp')) #NOTE: syntax for adding unit conversions + else: + cols_aliased.append(col) + + cols = cols_aliased + + q = sa.select(cols) + view_name = f"{ts_tblname}_vu" + + db_plus_vu = f'{database_name}_{view_name}' + if len(db_plus_vu) > 63: + raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') + view = sa.Table(view_name, meta) + create_view = CreateView(view, q, or_replace=True) + logger.info(f"Creating timeseries view: {view_name}") + engine.execute(create_view) + logger.debug('Columns in view:') + for c in cols: + logger.debug(c) + + def main(): # ComStock run comstock = cspp.ComStock( - s3_base_dir='com-sdr/production_runs/2025_r3_2012/test', # If run not on S3, download results_up**.parquet manually - comstock_run_name='sdr_2025_r3_2012_1of5_test100_df', # Name of the run on S3 - comstock_run_version='sdr_2025_r3_2012_1of5_test100_df', # Use whatever you want to see in plot and folder names + s3_base_dir='com-sdr', # If run not on S3, download results_up**.parquet manually + comstock_run_name='rtuadv_v11', # Name of the run on S3 + comstock_run_version='rtuadv_v11', # Use whatever you want to see in plot and folder names comstock_year=2018, # Typically don't change this athena_table_name=None, # Typically don't change this truth_data_version='v01', # Typically don't change this @@ -24,8 +587,8 @@ def main(): skip_missing_columns=True, # False if you want to ensure you have all data specified for export reload_from_cache=True, # True if CSV already made and want faster reload times include_upgrades=True, # False if not looking at upgrades - upgrade_ids_to_skip=[], # Use [1, 3] etc. to exclude certain upgrades - make_timeseries_plots=False, + upgrade_ids_to_skip=[2,3], # Use [1, 3] etc. to exclude certain upgrades + make_timeseries_plots=True, states={ #'MN': 'Minnesota', # specify state to use for timeseries plots in dictionary format. State ID must correspond correctly. 
'MA':'Massachusetts', @@ -34,6 +597,7 @@ def main(): #'AZ': 'Arizona', #'TN': 'Tennessee' }, + # [('state',['VA','AZ']) upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' ) @@ -84,7 +648,7 @@ def main(): 'partition_cols': {}, 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. - 'file_types': ['csv'], # other options:'parquet' + 'file_types': ['parquet'], # other options:'parquet' } ] @@ -92,15 +656,31 @@ def main(): # for upgrade_id in comstock.upgrade_ids_to_process: # # if upgrade_id == 0: # # continue - # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) - - ## Compare to CBECS - #comp = cspp.ComStockToCBECSComparison(cbecs_list=[cbecs], - # comstock_list=[comstock], - # upgrade_id='All', - # make_comparison_plots=True, - # make_hvac_plots=False) - #comp.export_to_csv_wide() # May comment this out after run once + # # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) + # if comstock.make_timeseries_plots: + # s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + # s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) + # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) + + + # write select results to S3 for Athena/Glue when needed for timeseries plots + if comstock.make_timeseries_plots: + s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + database = "enduse" + dataset_name = f"{comstock.comstock_run_name}/{comstock.comstock_run_name}" # name of the dir in s3_dir we want to make new tables and views for + crawler_name = comstock.comstock_run_name # used to set name of crawler, cannot include slashes + workgroup = "eulp" # Athena workgroup to use + glue_service_role = "service-role/AWSGlueServiceRole-default" + + # Export parquet files to S3 for Athena/Glue + # TODO: modify so that county data can be used for timeseries plots + # TODO: Modify geo export structure to specify parquet files only for this part of the workflow + #create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", + # dataset_name=crawler_name, + # database_name=database, + # glue_service_role=glue_service_role) + #fix_timeseries_tables(crawler_name, database) + #create_views(crawler_name, database, workgroup) # Create measure run comparisons; only use if run has measures comparison = cspp.ComStockMeasureComparison(comstock, states=comstock.states, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 13a817662..78ac55820 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -58,6 +58,7 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc """ # Initialize members + self.s3_base_dir = s3_base_dir self.comstock_run_name = comstock_run_name self.comstock_run_version = comstock_run_version self.year = comstock_year @@ -2711,7 
+2712,11 @@ def create_weighted_aggregate_output(self, assert isinstance(wtd_agg_outs, pl.LazyFrame) return wtd_agg_outs - def export_metadata_and_annual_results_for_upgrade(self, upgrade_id, geo_exports, n_parallel=-1): + def export_metadata_and_annual_results_for_upgrade(self, upgrade_id, geo_exports, n_parallel=-1, output_dir=None): + # Use self.output_dir if output_dir is not specified + if output_dir is None: + output_dir = self.output_dir + # # Define the geographic partitions to export # geo_exports = [ # {'geo_top_dir': 'national', @@ -2777,23 +2782,23 @@ def export_metadata_and_annual_results_for_upgrade(self, upgrade_id, geo_exports first_geo_combos = first_geo_combos.sort(by=geo_col_names[0]) # Make a directory for the geography type - full_geo_dir = f"{self.output_dir['fs_path']}/metadata_and_annual_results/{geo_top_dir}" - self.output_dir['fs'].mkdirs(full_geo_dir, exist_ok=True) + full_geo_dir = f"{output_dir['fs_path']}/metadata_and_annual_results/{geo_top_dir}" + output_dir['fs'].mkdirs(full_geo_dir, exist_ok=True) # Make a directory for each data type X file type combo if None in aggregation_levels: for data_type in data_types: for file_type in file_types: - self.output_dir['fs'].mkdirs(f'{full_geo_dir}/{data_type}/{file_type}', exist_ok=True) + output_dir['fs'].mkdirs(f'{full_geo_dir}/{data_type}/{file_type}', exist_ok=True) # Make an aggregates directory for the geography type - full_geo_agg_dir = f"{self.output_dir['fs_path']}/metadata_and_annual_results_aggregates/{geo_top_dir}" - self.output_dir['fs'].mkdirs(full_geo_agg_dir, exist_ok=True) + full_geo_agg_dir = f"{output_dir['fs_path']}/metadata_and_annual_results_aggregates/{geo_top_dir}" + output_dir['fs'].mkdirs(full_geo_agg_dir, exist_ok=True) # Make a directory for each data type X file type combo for data_type in data_types: for file_type in file_types: - self.output_dir['fs'].mkdirs(f'{full_geo_agg_dir}/{data_type}/{file_type}', exist_ok=True) + output_dir['fs'].mkdirs(f'{full_geo_agg_dir}/{data_type}/{file_type}', exist_ok=True) # Builds a file path for each aggregate based on name, file type, and aggregation level def get_file_path(full_geo_agg_dir, full_geo_dir, geo_prefixes, geo_levels, file_type, aggregation_level): @@ -2804,7 +2809,7 @@ def get_file_path(full_geo_agg_dir, full_geo_dir, geo_prefixes, geo_levels, file geo_level_dir = f'{agg_level_dir}/{data_type}/{file_type}' if len(geo_levels) > 0: geo_level_dir = f'{geo_level_dir}/' + '/'.join(geo_levels) - self.output_dir['fs'].mkdirs(geo_level_dir, exist_ok=True) + output_dir['fs'].mkdirs(geo_level_dir, exist_ok=True) file_name = f'upgrade{upgrade_id}' # Add geography prefix to filename if len(geo_prefixes) > 0: @@ -2922,7 +2927,7 @@ def get_file_path(full_geo_agg_dir, full_geo_dir, geo_prefixes, geo_levels, file for file_type in file_types: file_path = get_file_path(full_geo_agg_dir, full_geo_dir, geo_prefixes, geo_levels, file_type, aggregation_level) logger.debug(f"Queuing {file_path}") - combo = (geo_data_for_data_type, self.output_dir, file_type, file_path) + combo = (geo_data_for_data_type, output_dir, file_type, file_path) combos_to_write.append(combo) # Write files in parallel @@ -2994,7 +2999,7 @@ def get_file_path(full_geo_agg_dir, full_geo_dir, geo_prefixes, geo_levels, file for file_type in file_types: file_path = get_file_path(full_geo_agg_dir, full_geo_dir, geo_prefixes, geo_levels, file_type, aggregation_level) logger.debug(f"Queuing {file_path}: n_cols = {n_cols:,}, n_rows = {n_rows:,}") - combo = (data_type_df, self.output_dir, 
file_type, file_path) + combo = (data_type_df, output_dir, file_type, file_path) combos_to_write.append(combo) # Write files in parallel diff --git a/postprocessing/comstockpostproc/comstock_measure_comparison.py b/postprocessing/comstockpostproc/comstock_measure_comparison.py index f3acff9a9..db56bb936 100644 --- a/postprocessing/comstockpostproc/comstock_measure_comparison.py +++ b/postprocessing/comstockpostproc/comstock_measure_comparison.py @@ -39,10 +39,13 @@ def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_p self.dataset_name = comstock_object.dataset_name self.output_dir = os.path.join( current_dir, '..', 'output', self.dataset_name, 'measure_runs') + self.column_for_grouping = self.UPGRADE_NAME self.dict_measure_dir = {} # this can be called to determine output directory self.upgrade_ids_for_comparison = comstock_object.upgrade_ids_for_comparison self.comstock_run_name = comstock_object.comstock_run_name + self.athena_table_name = comstock_object.athena_table_name + self.s3_base_dir = comstock_object.s3_base_dir self.states = states self.make_timeseries_plots = make_timeseries_plots self.lazyframe_plotter = LazyFramePlotter() @@ -82,7 +85,7 @@ def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_p # make consumption plots for upgrades if requested by user if make_comparison_plots: logger.info(f'Making plots for upgrade {upgrade}') - self.make_plots(df_upgrade, self.column_for_grouping, states, make_timeseries_plots, color_map, self.dict_measure_dir[upgrade]) + self.make_plots(df_upgrade, self.column_for_grouping, states, make_timeseries_plots, color_map, self.dict_measure_dir[upgrade], self.s3_base_dir, self.athena_table_name, self.comstock_run_name) else: logger.info("make_comparison_plots is set to false, so not plots were created. 
Set make_comparison_plots to True for plots.") @@ -120,13 +123,16 @@ def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_p end_time = pd.Timestamp.now() logger.info(f"Time taken to make all plots is {end_time - start_time}") - def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make_timeseries_plots, color_map, output_dir): + def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make_timeseries_plots, color_map, output_dir, s3_base_dir, athena_table_name, comstock_run_name): time_start = pd.Timestamp.now() BASIC_PARAMS = { 'column_for_grouping': column_for_grouping, 'color_map': color_map, - 'output_dir': output_dir + 'output_dir': output_dir, + 's3_base_dir':s3_base_dir, + 'athena_table_name':athena_table_name, + 'comstock_run_name':comstock_run_name } #LazyFramePlotter.plot_with_lazy( @@ -189,22 +195,25 @@ def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make 'output_dir': output_dir} LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_peak_week_by_state, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_WEIGHT, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_season_average_by_state, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_WEIGHT, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) - LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_season_average_by_state, lazy_frame=lazy_frame.clone(), - columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_WEIGHT, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) + columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT, + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_season_average_by_state, lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT + #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_season_average_by_state, lazy_frame=lazy_frame.clone(), + # columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT time_end = pd.Timestamp.now() logger.info(f"Time taken to make plots is {time_end - time_start}") - def make_comparative_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make_timeseries_plots, color_map, output_dir): + def make_comparative_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make_timeseries_plots, color_map, output_dir, s3_base_dir, athena_table_name, comstock_run_name): # Make plots comparing the upgrades assert isinstance(lazy_frame, pl.LazyFrame) BASIC_PARAMS = { 'column_for_grouping': column_for_grouping, 'color_map': color_map, - 'output_dir': output_dir + 'output_dir': output_dir, + 's3_base_dir': s3_base_dir, + 'athena_table_name': athena_table_name, + 'comstock_run_name': comstock_run_name } logger.info(f'Making comparison plots for upgrade groupings') diff --git a/postprocessing/comstockpostproc/comstock_query_builder.py b/postprocessing/comstockpostproc/comstock_query_builder.py new file mode 100644 index 000000000..bedc032b8 --- /dev/null +++ b/postprocessing/comstockpostproc/comstock_query_builder.py @@ -0,0 +1,409 @@ +""" +Query templates and builders for ComStock data 
extraction from Athena. + +This module provides SQL query templates and builder functions to construct +Athena queries for various ComStock analysis needs, separating query logic +from the main ComStock processing classes. +""" + +import logging +from typing import Dict, List, Optional, Union + +logger = logging.getLogger(__name__) + + +class ComStockQueryBuilder: + """ + Builder class for creating SQL queries to extract ComStock data from Athena. + + This class provides templated queries that can be customized with parameters + for different analysis needs while maintaining consistent query structure. + """ + + def __init__(self, athena_table_name: str): + """ + Initialize the query builder with the base Athena table name. + + Args: + athena_table_name: Base name of the Athena table (without _timeseries or _baseline suffix) + """ + self.athena_table_name = athena_table_name + self.timeseries_table = f"{athena_table_name}_timeseries" + self.baseline_table = f"{athena_table_name}_baseline" + + def get_monthly_energy_consumption_query(self, + upgrade_ids: Optional[List[Union[int, str]]] = None, + states: Optional[List[str]] = None, + building_types: Optional[List[str]] = None) -> str: + """ + Build query for monthly natural gas and electricity consumption by state and building type. + + Args: + upgrade_ids: List of upgrade IDs to filter on (optional) + states: List of state IDs to filter on (optional) + building_types: List of building types to filter on (optional) + + Returns: + SQL query string + """ + + ## Build WHERE clause conditions + #where_conditions = ['"build_existing_model.building_type" IS NOT NULL'] + + #if upgrade_ids: + # upgrade_list = ', '.join([f'"{uid}"' for uid in upgrade_ids]) + # where_conditions.append(f'"upgrade" IN ({upgrade_list})') + + #if states: + # state_list = ', '.join([f'"{state}"' for state in states]) + # where_conditions.append(f'SUBSTRING("build_existing_model.county_id", 2, 2) IN ({state_list})') + + #if building_types: + # btype_list = ', '.join([f'"{bt}"' for bt in building_types]) + # where_conditions.append(f'"build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" IN ({btype_list})') + + #where_clause = ' AND '.join(where_conditions) + + #query = f""" + # SELECT + # "upgrade", + # "month", + # "state_id", + # "building_type", + # sum("total_site_gas_kbtu") AS "total_site_gas_kbtu", + # sum("total_site_electricity_kwh") AS "total_site_electricity_kwh" + # FROM + # ( + # SELECT + # EXTRACT(MONTH from "time") as "month", + # SUBSTRING("build_existing_model.county_id", 2, 2) AS "state_id", + # "build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" as "building_type", + # "upgrade", + # "total_site_gas_kbtu", + # "total_site_electricity_kwh" + # FROM + # "{self.timeseries_table}" + # JOIN "{self.baseline_table}" + # ON "{self.timeseries_table}"."building_id" = "{self.baseline_table}"."building_id" + # WHERE {where_clause} + # ) + # GROUP BY + # "upgrade", + # "month", + # "state_id", + # "building_type" + #""" + + #return query.strip() + + def get_timeseries_aggregation_query(self, + upgrade_id: Union[int, str], + enduses: List[str], + restrictions: List[tuple] = None, + timestamp_grouping: str = 'hour', + building_ids: Optional[List[int]] = None, + weight_view_table: Optional[str] = None, + include_sample_stats: bool = True) -> str: + """ + Build query for timeseries data aggregation similar to buildstock query functionality. 
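+
+        Illustrative call sketch (the run name and enduse column are assumptions,
+        not values required by this module):
+
+            qb = ComStockQueryBuilder('rtuadv_v11')
+            sql = qb.get_timeseries_aggregation_query(
+                upgrade_id=0,
+                enduses=['total_site_electricity_kwh'],
+                timestamp_grouping='hour',
+            )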
+ + This matches the pattern from working buildstock queries that join timeseries data + with a weight view table for proper weighting and aggregation. + + Args: + upgrade_id: Upgrade ID to filter on + enduses: List of end use columns to select + restrictions: List of (column, values) tuples for filtering + timestamp_grouping: How to group timestamps ('hour', 'day', 'month') + building_ids: Specific building IDs to filter on (optional) + weight_view_table: Name of the weight view table (e.g., 'rtuadv_v11_md_agg_national_by_state_vu') + include_sample_stats: Whether to include sample_count, units_count, rows_per_sample + + Returns: + SQL query string + """ + + # If no weight view table provided, construct default name + if weight_view_table is None: + weight_view_table = f"{self.athena_table_name}_md_agg_national_by_state_vu" #TODO: make table name dynamic to aggregation level + + # Build timestamp grouping with date_trunc and time adjustment (like buildstock does) + if timestamp_grouping == 'hour': + time_select = f"date_trunc('hour', date_add('second', -900, {self.timeseries_table}.time)) AS time" + time_group = '1' + elif timestamp_grouping == 'day': + time_select = f"date_trunc('day', {self.timeseries_table}.time) AS time" + time_group = '1' + elif timestamp_grouping == 'month': + time_select = f"date_trunc('month', {self.timeseries_table}.time) AS time" + time_group = '1' + else: + time_select = f"{self.timeseries_table}.time AS time" + time_group = '1' + + # Build SELECT clause with sample statistics (matching buildstock pattern) + select_clauses = [time_select] + + if include_sample_stats: + select_clauses.extend([ + f"count(distinct({self.timeseries_table}.building_id)) AS sample_count", + f"(count(distinct({self.timeseries_table}.building_id)) * sum(1 * {weight_view_table}.weight)) / sum(1) AS units_count", + f"sum(1) / count(distinct({self.timeseries_table}.building_id)) AS rows_per_sample" + ]) + + # Build weighted enduse aggregations + for enduse in enduses: + weighted_sum = f"sum({self.timeseries_table}.{enduse} * 1 * {weight_view_table}.weight) AS {enduse}" + select_clauses.append(weighted_sum) + + select_clause = ',\n '.join(select_clauses) + + # Build WHERE clause - join conditions first, then filters + where_conditions = [ + f'{weight_view_table}."bldg_id" = {self.timeseries_table}.building_id', + f'{weight_view_table}.upgrade = {self.timeseries_table}.upgrade', + f'{weight_view_table}.upgrade = {upgrade_id}' + ] + + if building_ids: + bldg_list = ', '.join([str(bid) for bid in building_ids]) + where_conditions.append(f'{weight_view_table}."bldg_id" IN ({bldg_list})') + + # Handle restrictions - these are typically filters on the weight view table + if restrictions: + for column, values in restrictions: + if isinstance(values, (list, tuple)): + if len(values) == 1: + where_conditions.append(f'{weight_view_table}."{column}" = \'{values[0]}\'') + else: + value_list = ', '.join([f"'{v}'" for v in values]) + where_conditions.append(f'{weight_view_table}."{column}" IN ({value_list})') + else: + where_conditions.append(f'{weight_view_table}."{column}" = \'{values}\'') + + where_clause = ' AND '.join(where_conditions) + + # Build the query using FROM clause with comma-separated tables (like buildstock) + query = f"""SELECT {select_clause} + FROM {weight_view_table}, {self.timeseries_table} + WHERE {where_clause} + GROUP BY {time_group} + ORDER BY {time_group}""" + + return query + + def get_timeseries_aggregation_query_with_join(self, + upgrade_id: Union[int, str], + enduses: 
List[str], + restrictions: List[tuple] = None, + timestamp_grouping: str = 'hour', + building_ids: Optional[List[int]] = None) -> str: + """ + Alternative version using JOIN syntax instead of comma-separated FROM clause. + Use this if you prefer explicit JOIN syntax over the buildstock comma-style. + """ + + # Build SELECT clause for enduses + enduse_selects = [] + for enduse in enduses: + enduse_selects.append(f'sum("{enduse}") AS "{enduse}"') + + enduse_clause = ',\n '.join(enduse_selects) + + # Build timestamp grouping + if timestamp_grouping == 'hour': + time_select = 'EXTRACT(HOUR from "time") as "hour"' + time_group = '"hour"' + elif timestamp_grouping == 'day': + time_select = 'EXTRACT(DAY from "time") as "day"' + time_group = '"day"' + elif timestamp_grouping == 'month': + time_select = 'EXTRACT(MONTH from "time") as "month"' + time_group = '"month"' + else: + time_select = '"time"' + time_group = '"time"' + + # Build WHERE clause + where_conditions = [f'"upgrade" = "{upgrade_id}"'] + + if building_ids: + bldg_list = ', '.join([str(bid) for bid in building_ids]) + where_conditions.append(f'"{self.timeseries_table}"."building_id" IN ({bldg_list})') + + if restrictions: + for column, values in restrictions: + if isinstance(values, (list, tuple)): + value_list = ', '.join([f'"{v}"' for v in values]) + where_conditions.append(f'"{column}" IN ({value_list})') + else: + where_conditions.append(f'"{column}" = "{values}"') + + where_clause = ' AND '.join(where_conditions) + + query = f""" + SELECT + {time_select}, + {enduse_clause} + FROM + "{self.timeseries_table}" + JOIN "{self.baseline_table}" + ON "{self.timeseries_table}"."building_id" = "{self.baseline_table}"."building_id" + WHERE {where_clause} + GROUP BY + {time_group} + ORDER BY + {time_group} + """ + + return query.strip() + + def get_building_characteristics_query(self, + upgrade_ids: Optional[List[Union[int, str]]] = None, + characteristics: Optional[List[str]] = None, + filters: Optional[Dict[str, Union[str, List[str]]]] = None) -> str: + """ + Build query to extract building characteristics and metadata. 
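        A short illustrative call (all values shown are placeholders):

            builder = ComStockQueryBuilder('my_comstock_run')
            sql = builder.get_building_characteristics_query(
                upgrade_ids=[0, 1],
                characteristics=['building_id', 'upgrade', 'in.sqft'],
                filters={'build_existing_model.building_type': ['small_office']},
            )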
+ + Args: + upgrade_ids: List of upgrade IDs to include (optional, defaults to all) + characteristics: Specific characteristic columns to select (optional) + filters: Dictionary of column: value(s) filters to apply + + Returns: + SQL query string + """ + + # Default characteristics if none specified + if characteristics is None: + characteristics = [ + 'building_id', + 'upgrade', + 'build_existing_model.building_type', + 'build_existing_model.county_id', + 'build_existing_model.create_bar_from_building_type_ratios_bldg_type_a', + 'in.sqft', + 'in.geometry_building_type_recs' + ] + + # Build SELECT clause + select_clause = ',\n '.join([f'"{char}"' for char in characteristics]) + + # Build WHERE clause + where_conditions = [] + + if upgrade_ids: + upgrade_list = ', '.join([f'"{uid}"' for uid in upgrade_ids]) + where_conditions.append(f'"upgrade" IN ({upgrade_list})') + + if filters: + for column, values in filters.items(): + if isinstance(values, (list, tuple)): + value_list = ', '.join([f'"{v}"' for v in values]) + where_conditions.append(f'"{column}" IN ({value_list})') + else: + where_conditions.append(f'"{column}" = "{values}"') + + where_clause = 'WHERE ' + ' AND '.join(where_conditions) if where_conditions else '' + + query = f""" + SELECT + {select_clause} + FROM + "{self.baseline_table}" + {where_clause} + """ + + return query.strip() + + +def get_monthly_energy_query(athena_table_name: str, **kwargs) -> str: + """ + Convenience function to get monthly energy consumption query. + + Args: + athena_table_name: Base Athena table name + **kwargs: Additional arguments passed to get_monthly_energy_consumption_query + + Returns: + SQL query string + """ + builder = ComStockQueryBuilder(athena_table_name) + return builder.get_monthly_energy_consumption_query(**kwargs) + + +def get_timeseries_query(athena_table_name: str, **kwargs) -> str: + """ + Convenience function to get timeseries aggregation query. + + Args: + athena_table_name: Base Athena table name + **kwargs: Additional arguments passed to get_timeseries_aggregation_query + + Returns: + SQL query string + """ + builder = ComStockQueryBuilder(athena_table_name) + return builder.get_timeseries_aggregation_query(**kwargs) + + +def get_building_chars_query(athena_table_name: str, **kwargs) -> str: + """ + Convenience function to get building characteristics query. 
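    Illustrative call (the run name is a placeholder):

        sql = get_building_chars_query('my_comstock_run', upgrade_ids=[0])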
+ + Args: + athena_table_name: Base Athena table name + **kwargs: Additional arguments passed to get_building_characteristics_query + + Returns: + SQL query string + """ + builder = ComStockQueryBuilder(athena_table_name) + return builder.get_building_characteristics_query(**kwargs) + + +# Query template constants for common patterns +MONTHLY_ENERGY_TEMPLATE = """ +SELECT + "upgrade", + "month", + "state_id", + "building_type", + sum("total_site_gas_kbtu") AS "total_site_gas_kbtu", + sum("total_site_electricity_kwh") AS "total_site_electricity_kwh" +FROM +( + SELECT + EXTRACT(MONTH from "time") as "month", + SUBSTRING("build_existing_model.county_id", 2, 2) AS "state_id", + "build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" as "building_type", + "upgrade", + "total_site_gas_kbtu", + "total_site_electricity_kwh" + FROM + "{timeseries_table}" + JOIN "{baseline_table}" + ON "{timeseries_table}"."building_id" = "{baseline_table}"."building_id" + WHERE {where_clause} +) +GROUP BY + "upgrade", + "month", + "state_id", + "building_type" +""" + +TIMESERIES_AGG_TEMPLATE = """ +SELECT + {time_grouping}, + {enduse_aggregations} +FROM + "{timeseries_table}" +JOIN "{baseline_table}" + ON "{timeseries_table}"."building_id" = "{baseline_table}"."building_id" +WHERE {where_clause} +GROUP BY + {time_grouping} +ORDER BY + {time_grouping} +""" \ No newline at end of file diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 2cf6b5d37..30f239f40 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -11,9 +11,28 @@ import plotly.express as px import seaborn as sns import plotly.graph_objects as go +import sys from buildstock_query import BuildStockQuery import matplotlib.colors as mcolors from plotly.subplots import make_subplots +import comstockpostproc.comstock as comstock +from comstockpostproc.comstock_query_builder import ComStockQueryBuilder +import fsspec +import s3fs +from pyathena.arrow.cursor import ArrowCursor + +# --- put this at the VERY TOP of your main script --- +from pyathena import connect as _orig_connect +from pyathena.arrow.cursor import ArrowCursor +import buildstock_query.query_core as qc # this is what QueryCore calls + +def _connect_with_arrow(*args, **kwargs): + kwargs.setdefault("cursor_class", ArrowCursor) + kwargs.setdefault("cursor_kwargs", {"unload": True}) # downloads via boto3, no s3fs/fsspec involved + return _orig_connect(*args, **kwargs) + +qc.connect = _connect_with_arrow # patch the exact symbol QueryCore uses +# --- end patch --- matplotlib.use('Agg') logger = logging.getLogger(__name__) @@ -3010,92 +3029,206 @@ def plot_load_duration_curve(self, df, region, building_type, color_map, output_ plt.savefig(output_path, bbox_inches='tight') - # create weights table on S3 based on allocation outcomes - def clone_weights_table_on_s3(self, df, state, bldg_type_col, weight_col): + + + # get weighted load profiles + def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_name): """ - This method clones the weights table created during allocation on S3 so it can be reused for weighting timeseries profiles on S3. - The weights table is for the full baseline run at full resolution. + This method retrieves weighted timeseries profiles from s3/athena. + Returns dataframe with weighted kWh columns for baseline and upgrade. 
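        The return value is a (baseline_df, upgrade_df) tuple of pandas DataFrames.
        An illustrative call, matching how this method is invoked from the
        peak-week plotting routine below (state code and upgrade id are placeholders):

            dfs_base, dfs_up = self.get_weighted_load_profiles_from_s3(
                df, upgrade_num=['1'], state='MA', upgrade_name=['My Upgrade'])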
""" - self.get_allocated_weights_scaled_to_cbecs_for_upgrade(0) + # run crawler + run_data = BuildStockQuery(workgroup='eulp', + db_name='enduse', + table_name=self.comstock_run_name, + buildstock_type='comstock', + skip_reports=True, + metadata_table_suffix='_md_agg_national_by_state_vu', #TODO: make this more dynamic + #ts_table_suffix=f"_timeseries_vu" + ) + + print(run_data._tables.keys()) + breakpoint() + applic_bldgs_list = list(df.loc[(df[self.UPGRADE_ID].isin(upgrade_num)) & (df[self.UPGRADE_APPL]==True), self.BLDG_ID]) + applic_bldgs_list = [int(x) for x in applic_bldgs_list] - # get weighted load profiles - def wgt_by_btype(self, df, run_data, dict_wgts, upgrade_num, state, upgrade_name): - """ - This method weights the timeseries profiles. - Returns dataframe with weighted kWh columns. - """ - btype_list = df[self.BLDG_TYPE].unique() + import time + from datetime import datetime, time - applic_bldgs_list = list(df.loc[(df[self.UPGRADE_NAME].isin(upgrade_name)) & (df[self.UPGRADE_APPL]==True), self.BLDG_ID]) - applic_bldgs_list = [int(x) for x in applic_bldgs_list] + start_time = time() + start_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - dfs_base=[] - dfs_up=[] - for btype in btype_list: + print(f"\n[{start_str}] Starting processing for state {state}") - # get building weights - btype_wgt = dict_wgts[btype] + # loop through upgrades + for upgrade_id in df[self.UPGRADE_ID].unique(): - # apply weights by building type - def apply_wgts(df): - # Identify columns that contain 'kwh' in their names - kwh_columns = [col for col in df.columns if 'kwh' in col] + print(f"\n[{start_str}] Starting processing for state {state}, upgrade id {upgrade_id}...") - # Apply the weight and add the suffix 'weighted' - weighted_df = df[kwh_columns].apply(lambda x: x * btype_wgt).rename(columns=lambda x: x + '_weighted') - # Concatenate the new weighted columns with the original DataFrame without the unweighted 'kwh' columns - df_wgt = pd.concat([df.drop(columns=kwh_columns), weighted_df], axis=1) - return df_wgt + # if there are upgrades, restrict baseline to match upgrade applicability + if (upgrade_id in (0, "0")) and (df[self.UPGRADE_ID].nunique() > 1): - # baseline load data - aggregate electricity total only - df_base_ts_agg = run_data.agg.aggregate_timeseries( - upgrade_id=0, + # Create query builder and generate query + builder = ComStockQueryBuilder(self.comstock_run_name) + query = builder.get_timeseries_aggregation_query( + upgrade_id=upgrade_id, + enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), + restrictions=[ + ('in.state', [f"{state}"]), + ('bldg_id', [19]) # match applicability of upgrade applic_bldgs_list[0] + ], + timestamp_grouping='hour', + weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' + ) + + print("Generated Query:") + print("="*80) + print(query) + print("="*80) + + print(f"Getting weighted baseline load profile for state {state} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings.") + + ########start bsq + # need to replace buildstock query code with direct athena query function call + df_base_ts_agg_weighted = run_data.agg.aggregate_timeseries( + upgrade_id=upgrade_id, + enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), + restrict=[ + ('in.state', [f"{state}"]), + (run_data.bs_bldgid_column, [19]), # match applicability of upgrade applic_bldgs_list[0] + ], + timestamp_grouping_func='hour', + weights=[self.BLDG_WEIGHT], + get_query_only=False + 
) + else: + # baseline load data when no upgrades are present, or upgrade load data + + # Create query builder and generate query for upgrade data + builder = ComStockQueryBuilder(self.comstock_run_name) + query = builder.get_timeseries_aggregation_query( + upgrade_id=upgrade_id, + enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), + restrictions=[ + ('in.state', [f"{state}"]), + ('bldg_id', [19]) + ], + timestamp_grouping='hour', + weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' + ) + + print("Generated Upgrade Query:") + print("="*80) + print(query) + print("="*80) + + df_up_ts_agg_weighted = run_data.agg.aggregate_timeseries( + upgrade_id=upgrade_id, enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrict=[(('build_existing_model.building_type', [self.BLDG_TYPE_TO_SNAKE_CASE[btype]])), - ('state_abbreviation', [f"{state}"]), - (run_data.bs_bldgid_column, applic_bldgs_list), - ], + restrict=[ + ('in.state', [f"{state}"]), + (run_data.bs_bldgid_column, [19]), + ], timestamp_grouping_func='hour', + weights=[self.BLDG_WEIGHT], get_query_only=False ) - # add baseline data - df_base_ts_agg_weighted = apply_wgts(df_base_ts_agg) - df_base_ts_agg_weighted[self.UPGRADE_NAME] = 'baseline' - dfs_base.append(df_base_ts_agg_weighted) - for upgrade in upgrade_num: - # upgrade load data - all enduses - upgrade_ts_agg = run_data.agg.aggregate_timeseries( - upgrade_id=upgrade.astype(str), - enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrict=[(('build_existing_model.building_type', [self.BLDG_TYPE_TO_SNAKE_CASE[btype]])), - ('state_abbreviation', [f"{state}"]), - ], - timestamp_grouping_func='hour', - get_query_only=False - ) + #############end bsq + + end_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + print(f"[{end_str}] Finished upgrade_id {upgrade_id}") + + # Initialize default values in case no data was processed + df_base_ts_agg_weighted = pd.DataFrame() if 'df_base_ts_agg_weighted' not in locals() else df_base_ts_agg_weighted + df_up_ts_agg_weighted = pd.DataFrame() if 'df_up_ts_agg_weighted' not in locals() else df_up_ts_agg_weighted + + df_base_ts_agg_weighted.to_csv('df_base_ts_agg_weighted.csv') + df_up_ts_agg_weighted.to_csv('df_up_ts_agg_weighted.csv') + + return df_base_ts_agg_weighted, df_up_ts_agg_weighted + + ## get weighted load profiles + #def wgt_by_btype(self, df, run_data, dict_wgts, upgrade_num, state, upgrade_name): + # """ + # This method weights the timeseries profiles. + # Returns dataframe with weighted kWh columns. 
+ # """ + # #btype_list = df[self.BLDG_TYPE].unique() + + #applic_bldgs_list = list(df.loc[(df[self.UPGRADE_NAME].isin(upgrade_name)) & (df[self.UPGRADE_APPL]==True), self.BLDG_ID]) + #applic_bldgs_list = [int(x) for x in applic_bldgs_list] + + #dfs_base=[] + #dfs_up=[] + #for btype in btype_list: + + # # get building weights + # btype_wgt = dict_wgts[btype] + + # # apply weights by building type + # def apply_wgts(df): + # # Identify columns that contain 'kwh' in their names + # kwh_columns = [col for col in df.columns if 'kwh' in col] + + # # Apply the weight and add the suffix 'weighted' + # weighted_df = df[kwh_columns].apply(lambda x: x * btype_wgt).rename(columns=lambda x: x + '_weighted') + # # Concatenate the new weighted columns with the original DataFrame without the unweighted 'kwh' columns + # df_wgt = pd.concat([df.drop(columns=kwh_columns), weighted_df], axis=1) + + # return df_wgt + + # # baseline load data - aggregate electricity total only + # df_base_ts_agg_weighted = run_data.agg.aggregate_timeseries( + # upgrade_id=0, + # enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), + # restrict=[(('build_existing_model.building_type', [self.BLDG_TYPE_TO_SNAKE_CASE[btype]])), + # ('state_abbreviation', [f"{state}"]), + # (run_data.bs_bldgid_column, applic_bldgs_list), + # ], + # timestamp_grouping_func='hour', + # get_query_only=False + # ) + + # # add baseline data + # df_base_ts_agg_weighted[self.UPGRADE_NAME] = 'baseline' + + # dfs_base.append(df_base_ts_agg_weighted) - # add upgrade data - df_upgrade_ts_agg_weighted = apply_wgts(upgrade_ts_agg) - df_upgrade_ts_agg_weighted[self.UPGRADE_NAME] = self.dict_upid_to_upname[upgrade] - dfs_up.append(df_upgrade_ts_agg_weighted) + # for upgrade in upgrade_num: + # # upgrade load data - all enduses + # upgrade_ts_agg = run_data.agg.aggregate_timeseries( + # upgrade_id=upgrade.astype(str), + # enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), + # restrict=[(('build_existing_model.building_type', [self.BLDG_TYPE_TO_SNAKE_CASE[btype]])), + # ('state_abbreviation', [f"{state}"]), + # ], + # timestamp_grouping_func='hour', + # get_query_only=False + # ) + # # add upgrade data + # df_upgrade_ts_agg_weighted = apply_wgts(upgrade_ts_agg) + # df_upgrade_ts_agg_weighted[self.UPGRADE_NAME] = self.dict_upid_to_upname[upgrade] + # dfs_up.append(df_upgrade_ts_agg_weighted) - # concatinate and combine baseline data - dfs_base_combined = pd.concat(dfs_base, join='outer', ignore_index=True) - dfs_base_combined = dfs_base_combined.groupby(['time', self.UPGRADE_NAME], observed=True, as_index=False)[dfs_base_combined.loc[:, dfs_base_combined.columns.str.contains('_kwh')].columns].sum() - # concatinate and combine upgrade data - dfs_upgrade_combined = pd.concat(dfs_up, join='outer', ignore_index=True) - dfs_upgrade_combined = dfs_upgrade_combined.groupby(['time', self.UPGRADE_NAME], observed=True, as_index=False)[dfs_upgrade_combined.loc[:, dfs_upgrade_combined.columns.str.contains('_kwh')].columns].sum() + ## concatinate and combine baseline data + #dfs_base_combined = pd.concat(dfs_base, join='outer', ignore_index=True) + #dfs_base_combined = dfs_base_combined.groupby(['time', self.UPGRADE_NAME], observed=True, as_index=False)[dfs_base_combined.loc[:, dfs_base_combined.columns.str.contains('_kwh')].columns].sum() + + ## concatinate and combine upgrade data + #dfs_upgrade_combined = pd.concat(dfs_up, join='outer', ignore_index=True) + #dfs_upgrade_combined = 
dfs_upgrade_combined.groupby(['time', self.UPGRADE_NAME], observed=True, as_index=False)[dfs_upgrade_combined.loc[:, dfs_upgrade_combined.columns.str.contains('_kwh')].columns].sum() + + #return dfs_base_combined, dfs_upgrade_combined - return dfs_base_combined, dfs_upgrade_combined # plot order_list = [ @@ -3124,175 +3257,168 @@ def map_to_season(month): def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, states, color_map, comstock_run_name): #, df, region, building_type, color_map, output_dir - # run crawler - run_data = BuildStockQuery('eulp', - 'enduse', - self.comstock_run_name, - buildstock_type='comstock', - skip_reports=False) - # get upgrade ID - df_upgrade = df.loc[df[self.UPGRADE_ID]!=0, :] + df_data = df.copy() + # coerce data type of upgrade ID - arrives as float, need str of int for querying + df_data[self.UPGRADE_ID] = pd.to_numeric(df_data[self.UPGRADE_ID], errors="coerce").astype("Int64").astype(str) + df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0") & (df_data[self.UPGRADE_ID]!=0), :] upgrade_num = list(df_upgrade[self.UPGRADE_ID].unique()) upgrade_name = list(df_upgrade[self.UPGRADE_NAME].unique()) - # get weights - dict_wgts = df_upgrade.groupby(self.BLDG_TYPE, observed=True)[self.BLDG_WEIGHT].mean().to_dict() - # apply queries and weighting for state, state_name in states.items(): - dfs_base_combined, dfs_upgrade_combined = self.wgt_by_btype(df, run_data, dict_wgts, upgrade_num, state, upgrade_name) - - # merge into single dataframe - dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) - - # set index - dfs_merged.set_index("time", inplace=True) - dfs_merged['Month'] = dfs_merged.index.month - - def map_to_season(month): - if 3 <= month <= 5: - return 'Spring' - elif 6 <= month <= 8: - return 'Summer' - elif 9 <= month <= 11: - return 'Fall' - else: - return 'Winter' - - # Apply the mapping function to create the "Season" column - dfs_merged['Season'] = dfs_merged['Month'].apply(map_to_season) - dfs_merged['Week_of_Year'] = dfs_merged.index.isocalendar().week - dfs_merged['Day_of_Year'] = dfs_merged.index.dayofyear - dfs_merged['Day_of_Week'] = dfs_merged.index.dayofweek - dfs_merged['Hour_of_Day'] = dfs_merged.index.hour - dfs_merged['Year'] = dfs_merged.index.year - # make dec 31st last week of year - dfs_merged.loc[dfs_merged['Day_of_Year']==365, 'Week_of_Year'] = 55 - dfs_merged = dfs_merged.loc[dfs_merged['Year']==2018, :] - max_peak = dfs_merged.loc[:, 'total_site_electricity_kwh_weighted'].max() - - # find peak week by season - seasons = ['Spring', 'Summer', 'Fall', 'Winter'] - for season in seasons: - peak_week = dfs_merged.loc[dfs_merged['Season']==season, ["total_site_electricity_kwh_weighted", "Week_of_Year"]] - peak_week = peak_week.loc[peak_week["total_site_electricity_kwh_weighted"] == peak_week["total_site_electricity_kwh_weighted"].max(), "Week_of_Year"][0] - - - # filter to the week - dfs_merged_pw = dfs_merged.loc[dfs_merged["Week_of_Year"]==peak_week, :].copy() - #dfs_merged_pw = dfs_merged_pw.loc[:, dfs_merged_pw.columns.str.contains("electricity")] - dfs_merged_pw.reset_index(inplace=True) - dfs_merged_pw = dfs_merged_pw.sort_values('time') - - # rename columns - dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("electricity_", "") - dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("_kwh_weighted", "") - - # convert hourly kWH to 15 minute MW - dfs_merged_pw.loc[:, self.order_list] = dfs_merged_pw.loc[:, self.order_list]/1000 - dfs_merged_pw.loc[:, "total_site"] = 
dfs_merged_pw.loc[:, "total_site"]/1000 - - # add upgrade traces - # Create traces for area plot - traces = [] - # add aggregate measure load - dfs_merged_pw_up = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] != "baseline"] - dfs_merged_pw_up.columns = dfs_merged_pw_up.columns.str.replace("total_site", "Measure Total") - # if only 1 upgrade, plot end uses and total - if len(upgrade_num) == 1: - # loop through end uses - for enduse in self.order_list: - trace = go.Scatter( - x=dfs_merged_pw_up['time'], - y=dfs_merged_pw_up[enduse], - fill='tonexty', - fillcolor=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], - mode='none', - line=dict(color=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], width=0.5), - name=enduse, - stackgroup='stack' - ) - traces.append(trace) - - # Create a trace for the upgrade load - upgrade_trace = go.Scatter( - x=dfs_merged_pw_up['time'], - y=dfs_merged_pw_up['Measure Total'], - mode='lines', - line=dict(color='black', width=1.8, dash='solid'), - name='Measure Total', - ) - traces.append(upgrade_trace) - else: - # if more than 1 upgrade, add only aggregate loads - for upgrade in upgrade_num: - dfs_merged_pw_up_mult = dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[upgrade]] - upgrade_trace = go.Scatter( - x=dfs_merged_pw_up_mult['time'], - y=dfs_merged_pw_up_mult['Measure Total'], - mode='lines', - line=dict(width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[upgrade]] - name=self.dict_upid_to_upname[upgrade], - ) - traces.append(upgrade_trace) - - - # add baseline load - dfs_merged_pw_base = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name']=="baseline"] - dfs_merged_pw_base.columns = dfs_merged_pw_base.columns.str.replace("total_site", "Baseline Total") - - # Create a trace for the baseline load - baseline_trace = go.Scatter( - x=dfs_merged_pw_base['time'], - y=dfs_merged_pw_base['Baseline Total'], - mode='lines', - #line=dict(color='black', width=1.75), - line=dict(color='black', width=1.8, dash='dot'), - name='Baseline Total' - ) - traces.append(baseline_trace) - - # Create the layout - layout = go.Layout( - #title=f"{season} Peak Week - {state_name}", - xaxis=dict(mirror=True, title=None, showline=True), - yaxis=dict(mirror=True, title='Electricity Demand (MW)', range=[0, max_peak/1000], showline=True), - legend=dict(font=dict(size=8), y=1.02, xanchor="left", x=0.0, orientation="h", yanchor="bottom", itemwidth=30), - legend_traceorder="reversed", - showlegend=True, - template='simple_white', - width=650, - height=400, - annotations=[ - dict(x=-0.1, # Centered on the x-axis - y=-0.35, # Adjust this value as needed to place the title correctly - xref='paper', - yref='paper', - text=f"{season} Peak Week, Applicable Buildings - {state_name}", - showarrow=False, - font=dict( - size=16 - ))] - ) - - # Create the figure - fig = go.Figure(data=traces, layout=layout) - - # Save fig - title = f"{season}_peak_week" - fig_name = f'{title.replace(" ", "_").lower()}.{self.image_type}' - fig_name_html = f'{title.replace(" ", "_").lower()}.html' - fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{state_name}")) - if not os.path.exists(fig_sub_dir): - os.makedirs(fig_sub_dir) - fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) - fig_path_html = os.path.abspath(os.path.join(fig_sub_dir, fig_name_html)) - - fig.write_image(fig_path, scale=10) - fig.write_html(fig_path_html) - - dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{state_name}.csv") + 
dfs_base_combined, dfs_upgrade_combined = self.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) + + # # merge into single dataframe + # dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) + + # # set index + # dfs_merged.set_index("time", inplace=True) + # dfs_merged['Month'] = dfs_merged.index.month + + # def map_to_season(month): + # if 3 <= month <= 5: + # return 'Spring' + # elif 6 <= month <= 8: + # return 'Summer' + # elif 9 <= month <= 11: + # return 'Fall' + # else: + # return 'Winter' + + # # Apply the mapping function to create the "Season" column + # dfs_merged['Season'] = dfs_merged['Month'].apply(map_to_season) + # dfs_merged['Week_of_Year'] = dfs_merged.index.isocalendar().week + # dfs_merged['Day_of_Year'] = dfs_merged.index.dayofyear + # dfs_merged['Day_of_Week'] = dfs_merged.index.dayofweek + # dfs_merged['Hour_of_Day'] = dfs_merged.index.hour + # dfs_merged['Year'] = dfs_merged.index.year + # # make dec 31st last week of year + # dfs_merged.loc[dfs_merged['Day_of_Year']==365, 'Week_of_Year'] = 55 + # dfs_merged = dfs_merged.loc[dfs_merged['Year']==2018, :] + # max_peak = dfs_merged.loc[:, 'total_site_electricity_kwh_weighted'].max() + + # # find peak week by season + # seasons = ['Spring', 'Summer', 'Fall', 'Winter'] + # for season in seasons: + # peak_week = dfs_merged.loc[dfs_merged['Season']==season, ["total_site_electricity_kwh_weighted", "Week_of_Year"]] + # peak_week = peak_week.loc[peak_week["total_site_electricity_kwh_weighted"] == peak_week["total_site_electricity_kwh_weighted"].max(), "Week_of_Year"][0] + + + # # filter to the week + # dfs_merged_pw = dfs_merged.loc[dfs_merged["Week_of_Year"]==peak_week, :].copy() + # #dfs_merged_pw = dfs_merged_pw.loc[:, dfs_merged_pw.columns.str.contains("electricity")] + # dfs_merged_pw.reset_index(inplace=True) + # dfs_merged_pw = dfs_merged_pw.sort_values('time') + + # # rename columns + # dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("electricity_", "") + # dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("_kwh_weighted", "") + + # # convert hourly kWH to 15 minute MW + # dfs_merged_pw.loc[:, self.order_list] = dfs_merged_pw.loc[:, self.order_list]/1000 + # dfs_merged_pw.loc[:, "total_site"] = dfs_merged_pw.loc[:, "total_site"]/1000 + + # # add upgrade traces + # # Create traces for area plot + # traces = [] + # # add aggregate measure load + # dfs_merged_pw_up = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] != "baseline"] + # dfs_merged_pw_up.columns = dfs_merged_pw_up.columns.str.replace("total_site", "Measure Total") + # # if only 1 upgrade, plot end uses and total + # if len(upgrade_num) == 1: + # # loop through end uses + # for enduse in self.order_list: + # trace = go.Scatter( + # x=dfs_merged_pw_up['time'], + # y=dfs_merged_pw_up[enduse], + # fill='tonexty', + # fillcolor=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], + # mode='none', + # line=dict(color=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], width=0.5), + # name=enduse, + # stackgroup='stack' + # ) + # traces.append(trace) + + # # Create a trace for the upgrade load + # upgrade_trace = go.Scatter( + # x=dfs_merged_pw_up['time'], + # y=dfs_merged_pw_up['Measure Total'], + # mode='lines', + # line=dict(color='black', width=1.8, dash='solid'), + # name='Measure Total', + # ) + # traces.append(upgrade_trace) + # else: + # # if more than 1 upgrade, add only aggregate loads + # for upgrade in upgrade_num: + # dfs_merged_pw_up_mult = 
dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[upgrade]] + # upgrade_trace = go.Scatter( + # x=dfs_merged_pw_up_mult['time'], + # y=dfs_merged_pw_up_mult['Measure Total'], + # mode='lines', + # line=dict(width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[upgrade]] + # name=self.dict_upid_to_upname[upgrade], + # ) + # traces.append(upgrade_trace) + + + # # add baseline load + # dfs_merged_pw_base = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name']=="baseline"] + # dfs_merged_pw_base.columns = dfs_merged_pw_base.columns.str.replace("total_site", "Baseline Total") + + # # Create a trace for the baseline load + # baseline_trace = go.Scatter( + # x=dfs_merged_pw_base['time'], + # y=dfs_merged_pw_base['Baseline Total'], + # mode='lines', + # #line=dict(color='black', width=1.75), + # line=dict(color='black', width=1.8, dash='dot'), + # name='Baseline Total' + # ) + # traces.append(baseline_trace) + + # # Create the layout + # layout = go.Layout( + # #title=f"{season} Peak Week - {state_name}", + # xaxis=dict(mirror=True, title=None, showline=True), + # yaxis=dict(mirror=True, title='Electricity Demand (MW)', range=[0, max_peak/1000], showline=True), + # legend=dict(font=dict(size=8), y=1.02, xanchor="left", x=0.0, orientation="h", yanchor="bottom", itemwidth=30), + # legend_traceorder="reversed", + # showlegend=True, + # template='simple_white', + # width=650, + # height=400, + # annotations=[ + # dict(x=-0.1, # Centered on the x-axis + # y=-0.35, # Adjust this value as needed to place the title correctly + # xref='paper', + # yref='paper', + # text=f"{season} Peak Week, Applicable Buildings - {state_name}", + # showarrow=False, + # font=dict( + # size=16 + # ))] + # ) + + # # Create the figure + # fig = go.Figure(data=traces, layout=layout) + + # # Save fig + # title = f"{season}_peak_week" + # fig_name = f'{title.replace(" ", "_").lower()}.{self.image_type}' + # fig_name_html = f'{title.replace(" ", "_").lower()}.html' + # fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{state_name}")) + # if not os.path.exists(fig_sub_dir): + # os.makedirs(fig_sub_dir) + # fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) + # fig_path_html = os.path.abspath(os.path.join(fig_sub_dir, fig_name_html)) + + # fig.write_image(fig_path, scale=10) + # fig.write_html(fig_path_html) + + # dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{state_name}.csv") def plot_measure_timeseries_season_average_by_state(self, df, output_dir, states, color_map, comstock_run_name): @@ -3512,7 +3638,7 @@ def map_to_dow(dow): def plot_measure_timeseries_annual_average_by_state_and_enduse(self, df, output_dir, states, color_map, comstock_run_name): # run crawler - run_data = BuildStockQuery('eulp', 'enduse', self.comstock_run_name, buildstock_type='comstock', skip_reports=False) + run_data = BuildStockQuery('eulp', 'enduse', self.comstock_run_name, buildstock_type='comstock', skip_reports=False, timeseries_table_suffix='something') # get upgrade ID df_upgrade = df.loc[df[self.UPGRADE_ID] != 0, :] diff --git a/postprocessing/comstockpostproc/s3_utilities_mixin.py b/postprocessing/comstockpostproc/s3_utilities_mixin.py index 28b2d3cc4..3e6d01a8e 100644 --- a/postprocessing/comstockpostproc/s3_utilities_mixin.py +++ b/postprocessing/comstockpostproc/s3_utilities_mixin.py @@ -220,7 +220,7 @@ def setup_fsspec_filesystem(self, output_dir, aws_profile_name): output_dir = os.path.abspath(os.path.join(current_dir, '..', 'output', self.dataset_name)) # 
PyAthena >2.18.0 implements an s3 filesystem that replaces s3fs but does not implement file.open() # Make fsspec use the s3fs s3 filesystem implementation for writing files to S3 - register_implementation("s3", s3fs.S3FileSystem, clobber=True) + #register_implementation("s3", s3fs.S3FileSystem, clobber=True) out_fs, out_fs_path = url_to_fs(output_dir, profile=aws_profile_name) output_dir = { 'fs': out_fs, diff --git a/postprocessing/setup.py b/postprocessing/setup.py index b993c960e..50054820e 100644 --- a/postprocessing/setup.py +++ b/postprocessing/setup.py @@ -42,7 +42,7 @@ 'Programming Language :: Python :: 3.10', ], keywords='comstock postprocessing', - python_requires="==3.10.12", + python_requires=">=3.9", install_requires=[ 'boto3', 'botocore', @@ -58,7 +58,7 @@ 'scipy', 'seaborn>=0.12.0', 'xlrd', - 'buildstock_query @ git+https://github.com/NREL/buildstock-query@main' + #'buildstock_query @ git+https://github.com/NREL/buildstock-query@main' ], extras_require={ 'dev': [ From 14316b4352413d3b275f1a31cfd58dc632f068e0 Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Wed, 5 Nov 2025 11:51:08 -0700 Subject: [PATCH 03/16] Plotting updates - works for measure comparisons, but needs cleanup --- .../compare_upgrades-test_timeseries_plots.py | 63 +- .../comstock_query_builder.py | 83 ++- .../comstockpostproc/plotting_mixin.py | 542 +++++++----------- 3 files changed, 308 insertions(+), 380 deletions(-) diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index 28ed481ed..a1e47d1af 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -22,12 +22,6 @@ except Exception as e: print(f"{m:12} NOT IMPORTABLE: {e}") -# Python version details -print("python exe:", sys.executable) -print("python implementation:", platform.python_implementation()) -print("python version:", platform.python_version()) -print("sys.version:", sys.version.replace("\n", " ")) -print("sys.version_info:", sys.version_info) logging.basicConfig(level='INFO') # Use DEBUG, INFO, or WARNING logger = logging.getLogger(__name__) @@ -102,41 +96,6 @@ def create_sightglass_tables( glue.delete_crawler(Name=crawler_name) logger.info(f"Deleting Crawler {crawler_name}") - ## Crawl the timeseries targets with one crawler - #logger.info(f"Creating Crawler for timeseries data") - #crawler_name = f'{crawler_name_base}_ts' - #tbl_prefix = f'{tbl_prefix_base}_ts_' - - #crawler_params["Name"] = crawler_name - #crawler_params["TablePrefix"] = tbl_prefix - #crawler_params["Targets"]["S3Targets"] = [] - #for ts_path in fs.ls(f"{fs_path}/timeseries_individual_buildings/"): - # crawler_params["Targets"]["S3Targets"].append( - # {"Path": f"s3://{ts_path}", "SampleSize": 1} - # ) - - #breakpoint() - #try: - # crawler_info = glue.get_crawler(Name=crawler_name) - #except glue.exceptions.EntityNotFoundException as ex: - # logger.info(f"Creating Crawler {crawler_name}") - # glue.create_crawler(**crawler_params) - #else: - # logger.info(f"Updating Crawler {crawler_name}") - # glue.update_crawler(**crawler_params) - - #logger.info(f"Running crawler {crawler_name} on: {ts_path}") - #glue.start_crawler(Name=crawler_name) - #time.sleep(10) - - #crawler_info = glue.get_crawler(Name=crawler_name) - #while crawler_info["Crawler"]["State"] != "READY": - # time.sleep(10) - # crawler_info = glue.get_crawler(Name=crawler_name) - # logger.info(f"Crawler 
state: {crawler_info['Crawler']['State']}") - #glue.delete_crawler(Name=crawler_name) - #logger.info(f"Deleting Crawler {crawler_name}") - return dataset_name def fix_timeseries_tables(dataset_name: str, database_name: str = "vizstock"): @@ -418,21 +377,9 @@ def create_views( 'puma_west': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'} } - # Create a map of column names to aliases for partition columns - #col_to_alias = { aliases[by]['col']: aliases[by]['alias'] for by in bys if aliases[by] } - # Select columns for the metadata, aliasing partition columns cols = [] for col in metadata_tbl.columns: - # Identify partition column and find alias - # col_name = str(col).replace(f'{metadata_tblname}.', '') - # if col_name in col_to_alias: - # # logger.debug(f"Aliasing {col} as {col_to_alias[col_name]}") - # cols.insert( - # 2, metadata_tbl.c[col_name].label(col_to_alias[col_name]) - # ) - # else: - # cols.append(col) cols.append(col) continue @@ -590,12 +537,12 @@ def main(): upgrade_ids_to_skip=[2,3], # Use [1, 3] etc. to exclude certain upgrades make_timeseries_plots=True, states={ - #'MN': 'Minnesota', # specify state to use for timeseries plots in dictionary format. State ID must correspond correctly. + 'MN': 'Minnesota', # specify state to use for timeseries plots in dictionary format. State ID must correspond correctly. 'MA':'Massachusetts', - #'OR': 'Oregon', - #'LA': 'Louisiana', - #'AZ': 'Arizona', - #'TN': 'Tennessee' + 'OR': 'Oregon', + 'LA': 'Louisiana', + 'AZ': 'Arizona', + 'TN': 'Tennessee' }, # [('state',['VA','AZ']) upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 diff --git a/postprocessing/comstockpostproc/comstock_query_builder.py b/postprocessing/comstockpostproc/comstock_query_builder.py index bedc032b8..d1be591cc 100644 --- a/postprocessing/comstockpostproc/comstock_query_builder.py +++ b/postprocessing/comstockpostproc/comstock_query_builder.py @@ -172,14 +172,27 @@ def get_timeseries_aggregation_query(self, # Handle restrictions - these are typically filters on the weight view table if restrictions: for column, values in restrictions: + # Determine if this is a numeric column that shouldn't be quoted + numeric_columns = ['bldg_id', 'building_id', 'upgrade', 'upgrade_id'] + is_numeric = column in numeric_columns + if isinstance(values, (list, tuple)): if len(values) == 1: - where_conditions.append(f'{weight_view_table}."{column}" = \'{values[0]}\'') + if is_numeric: + where_conditions.append(f'{weight_view_table}."{column}" = {values[0]}') + else: + where_conditions.append(f'{weight_view_table}."{column}" = \'{values[0]}\'') else: - value_list = ', '.join([f"'{v}'" for v in values]) + if is_numeric: + value_list = ', '.join([str(v) for v in values]) + else: + value_list = ', '.join([f"'{v}'" for v in values]) where_conditions.append(f'{weight_view_table}."{column}" IN ({value_list})') else: - where_conditions.append(f'{weight_view_table}."{column}" = \'{values}\'') + if is_numeric: + where_conditions.append(f'{weight_view_table}."{column}" = {values}') + else: + where_conditions.append(f'{weight_view_table}."{column}" = \'{values}\'') where_clause = ' AND '.join(where_conditions) @@ -316,6 +329,70 @@ def get_building_characteristics_query(self, return query.strip() + def get_applicability_query(self, + upgrade_ids: List[Union[int, str]], + state: Optional[str] = None, + columns: Optional[List[str]] = None, + weight_view_table: Optional[str] = None) -> str: + """ + Build query to get applicable buildings 
and their characteristics from the weight view. + + Args: + upgrade_ids: List of upgrade IDs to filter on + state: State abbreviation to filter on (optional) + columns: Specific columns to select (optional, defaults to common applicability columns) + weight_view_table: Name of the weight view table (optional, uses default naming) + + Returns: + SQL query string + """ + + # If no weight view table provided, construct default name + if weight_view_table is None: + weight_view_table = f"{self.athena_table_name}_md_agg_national_by_state_vu" + + # Default columns for applicability queries + if columns is None: + columns = [ + 'dataset', + '"in.state"', + 'bldg_id', + 'upgrade', + 'applicability' + ] + + # Build SELECT clause + select_clause = ',\n '.join(columns) + + # Build WHERE clause + where_conditions = [] + + # Filter by upgrade IDs + if len(upgrade_ids) == 1: + where_conditions.append(f'upgrade = {upgrade_ids[0]}') + else: + upgrade_list = ','.join(map(str, upgrade_ids)) + where_conditions.append(f'upgrade IN ({upgrade_list})') + + # Filter by applicability + where_conditions.append('applicability = true') + + # Filter by state if provided + if state: + where_conditions.append(f'"in.state" = \'{state}\'') + + where_clause = ' AND '.join(where_conditions) + + query = f""" + SELECT DISTINCT + {select_clause} + FROM {weight_view_table} + WHERE {where_clause} + ORDER BY bldg_id + """ + + return query.strip() + def get_monthly_energy_query(athena_table_name: str, **kwargs) -> str: """ diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 30f239f40..783df17ca 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -3039,37 +3039,48 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_nam """ # run crawler - run_data = BuildStockQuery(workgroup='eulp', + athena_client = BuildStockQuery(workgroup='eulp', db_name='enduse', table_name=self.comstock_run_name, buildstock_type='comstock', skip_reports=True, - metadata_table_suffix='_md_agg_national_by_state_vu', #TODO: make this more dynamic + metadata_table_suffix='_md_agg_national_by_state_vu', #TODO: make this more dynamic for geo resolution #ts_table_suffix=f"_timeseries_vu" ) - print(run_data._tables.keys()) + print(athena_client._tables.keys()) + + # generate applicability for upgrade + # if there are multiple upgrades, applicability is based on first upgrade only + builder = ComStockQueryBuilder(self.comstock_run_name) + applicability_query = builder.get_applicability_query( + upgrade_ids=upgrade_num, + state=state, + columns=[ + 'dataset', + '"in.state"', + 'bldg_id', + 'upgrade', + 'applicability' + ] + ) - breakpoint() + #print("Applicability Query:") + #print("="*80) + #print(applicability_query) + #print("="*80) - applic_bldgs_list = list(df.loc[(df[self.UPGRADE_ID].isin(upgrade_num)) & (df[self.UPGRADE_APPL]==True), self.BLDG_ID]) + # Execute query to get applicable buildings + applic_df = athena_client.execute(applicability_query) + applic_bldgs_list = list(applic_df['bldg_id'].unique()) applic_bldgs_list = [int(x) for x in applic_bldgs_list] - import time from datetime import datetime, time - start_time = time() - start_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - print(f"\n[{start_str}] Starting processing for state {state}") - # loop through upgrades for upgrade_id in df[self.UPGRADE_ID].unique(): - print(f"\n[{start_str}] Starting processing for state {state}, 
upgrade id {upgrade_id}...") - - # if there are upgrades, restrict baseline to match upgrade applicability if (upgrade_id in (0, "0")) and (df[self.UPGRADE_ID].nunique() > 1): @@ -3080,156 +3091,57 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_nam enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), restrictions=[ ('in.state', [f"{state}"]), - ('bldg_id', [19]) # match applicability of upgrade applic_bldgs_list[0] + ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] ], timestamp_grouping='hour', weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' ) - - print("Generated Query:") - print("="*80) - print(query) - print("="*80) + + #print("Generated Query:") + #print("="*80) + #print(query) + #print("="*80) print(f"Getting weighted baseline load profile for state {state} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings.") - ########start bsq - # need to replace buildstock query code with direct athena query function call - df_base_ts_agg_weighted = run_data.agg.aggregate_timeseries( - upgrade_id=upgrade_id, - enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrict=[ - ('in.state', [f"{state}"]), - (run_data.bs_bldgid_column, [19]), # match applicability of upgrade applic_bldgs_list[0] - ], - timestamp_grouping_func='hour', - weights=[self.BLDG_WEIGHT], - get_query_only=False - ) + # temp save figure + df_base_ts_agg_weighted = athena_client.execute(query) + df_base_ts_agg_weighted[self.UPGRADE_NAME] = self.dict_upid_to_upname[0] + df_base_ts_agg_weighted.to_csv('comstock_unscaled_data.csv', index=False) + else: # baseline load data when no upgrades are present, or upgrade load data - - # Create query builder and generate query for upgrade data + + + ## Create query builder and generate query for upgrade data builder = ComStockQueryBuilder(self.comstock_run_name) query = builder.get_timeseries_aggregation_query( upgrade_id=upgrade_id, enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), restrictions=[ ('in.state', [f"{state}"]), - ('bldg_id', [19]) + ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] ], timestamp_grouping='hour', weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' ) - - print("Generated Upgrade Query:") - print("="*80) - print(query) - print("="*80) - - df_up_ts_agg_weighted = run_data.agg.aggregate_timeseries( - upgrade_id=upgrade_id, - enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrict=[ - ('in.state', [f"{state}"]), - (run_data.bs_bldgid_column, [19]), - ], - timestamp_grouping_func='hour', - weights=[self.BLDG_WEIGHT], - get_query_only=False - ) - - - #############end bsq - - end_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - print(f"[{end_str}] Finished upgrade_id {upgrade_id}") + + df_up_ts_agg_weighted = athena_client.execute(query) + df_up_ts_agg_weighted[self.UPGRADE_NAME] = self.dict_upid_to_upname[int(upgrade_num[0])] + df_up_ts_agg_weighted.to_csv('comstock_unscaled_upgrade_data.csv', index=False) + + #print("Generated Upgrade Query:") + #print("="*80) + #print(query) + #print("="*80) + # Initialize default values in case no data was processed df_base_ts_agg_weighted = pd.DataFrame() if 'df_base_ts_agg_weighted' not in locals() else df_base_ts_agg_weighted df_up_ts_agg_weighted = pd.DataFrame() if 'df_up_ts_agg_weighted' not in 
locals() else df_up_ts_agg_weighted - - df_base_ts_agg_weighted.to_csv('df_base_ts_agg_weighted.csv') - df_up_ts_agg_weighted.to_csv('df_up_ts_agg_weighted.csv') return df_base_ts_agg_weighted, df_up_ts_agg_weighted - ## get weighted load profiles - #def wgt_by_btype(self, df, run_data, dict_wgts, upgrade_num, state, upgrade_name): - # """ - # This method weights the timeseries profiles. - # Returns dataframe with weighted kWh columns. - # """ - # #btype_list = df[self.BLDG_TYPE].unique() - - #applic_bldgs_list = list(df.loc[(df[self.UPGRADE_NAME].isin(upgrade_name)) & (df[self.UPGRADE_APPL]==True), self.BLDG_ID]) - #applic_bldgs_list = [int(x) for x in applic_bldgs_list] - - #dfs_base=[] - #dfs_up=[] - #for btype in btype_list: - - # # get building weights - # btype_wgt = dict_wgts[btype] - - # # apply weights by building type - # def apply_wgts(df): - # # Identify columns that contain 'kwh' in their names - # kwh_columns = [col for col in df.columns if 'kwh' in col] - - # # Apply the weight and add the suffix 'weighted' - # weighted_df = df[kwh_columns].apply(lambda x: x * btype_wgt).rename(columns=lambda x: x + '_weighted') - # # Concatenate the new weighted columns with the original DataFrame without the unweighted 'kwh' columns - # df_wgt = pd.concat([df.drop(columns=kwh_columns), weighted_df], axis=1) - - # return df_wgt - - # # baseline load data - aggregate electricity total only - # df_base_ts_agg_weighted = run_data.agg.aggregate_timeseries( - # upgrade_id=0, - # enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - # restrict=[(('build_existing_model.building_type', [self.BLDG_TYPE_TO_SNAKE_CASE[btype]])), - # ('state_abbreviation', [f"{state}"]), - # (run_data.bs_bldgid_column, applic_bldgs_list), - # ], - # timestamp_grouping_func='hour', - # get_query_only=False - # ) - - # # add baseline data - # df_base_ts_agg_weighted[self.UPGRADE_NAME] = 'baseline' - - # dfs_base.append(df_base_ts_agg_weighted) - - # for upgrade in upgrade_num: - # # upgrade load data - all enduses - # upgrade_ts_agg = run_data.agg.aggregate_timeseries( - # upgrade_id=upgrade.astype(str), - # enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - # restrict=[(('build_existing_model.building_type', [self.BLDG_TYPE_TO_SNAKE_CASE[btype]])), - # ('state_abbreviation', [f"{state}"]), - # ], - # timestamp_grouping_func='hour', - # get_query_only=False - # ) - - # # add upgrade data - # df_upgrade_ts_agg_weighted = apply_wgts(upgrade_ts_agg) - # df_upgrade_ts_agg_weighted[self.UPGRADE_NAME] = self.dict_upid_to_upname[upgrade] - # dfs_up.append(df_upgrade_ts_agg_weighted) - - - ## concatinate and combine baseline data - #dfs_base_combined = pd.concat(dfs_base, join='outer', ignore_index=True) - #dfs_base_combined = dfs_base_combined.groupby(['time', self.UPGRADE_NAME], observed=True, as_index=False)[dfs_base_combined.loc[:, dfs_base_combined.columns.str.contains('_kwh')].columns].sum() - - ## concatinate and combine upgrade data - #dfs_upgrade_combined = pd.concat(dfs_up, join='outer', ignore_index=True) - #dfs_upgrade_combined = dfs_upgrade_combined.groupby(['time', self.UPGRADE_NAME], observed=True, as_index=False)[dfs_upgrade_combined.loc[:, dfs_upgrade_combined.columns.str.contains('_kwh')].columns].sum() - - #return dfs_base_combined, dfs_upgrade_combined - - # plot order_list = [ 'interior_equipment', @@ -3269,174 +3181,169 @@ def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, states, col for state, state_name in 
states.items(): dfs_base_combined, dfs_upgrade_combined = self.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) - # # merge into single dataframe - # dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) - - # # set index - # dfs_merged.set_index("time", inplace=True) - # dfs_merged['Month'] = dfs_merged.index.month - - # def map_to_season(month): - # if 3 <= month <= 5: - # return 'Spring' - # elif 6 <= month <= 8: - # return 'Summer' - # elif 9 <= month <= 11: - # return 'Fall' - # else: - # return 'Winter' - - # # Apply the mapping function to create the "Season" column - # dfs_merged['Season'] = dfs_merged['Month'].apply(map_to_season) - # dfs_merged['Week_of_Year'] = dfs_merged.index.isocalendar().week - # dfs_merged['Day_of_Year'] = dfs_merged.index.dayofyear - # dfs_merged['Day_of_Week'] = dfs_merged.index.dayofweek - # dfs_merged['Hour_of_Day'] = dfs_merged.index.hour - # dfs_merged['Year'] = dfs_merged.index.year - # # make dec 31st last week of year - # dfs_merged.loc[dfs_merged['Day_of_Year']==365, 'Week_of_Year'] = 55 - # dfs_merged = dfs_merged.loc[dfs_merged['Year']==2018, :] - # max_peak = dfs_merged.loc[:, 'total_site_electricity_kwh_weighted'].max() - - # # find peak week by season - # seasons = ['Spring', 'Summer', 'Fall', 'Winter'] - # for season in seasons: - # peak_week = dfs_merged.loc[dfs_merged['Season']==season, ["total_site_electricity_kwh_weighted", "Week_of_Year"]] - # peak_week = peak_week.loc[peak_week["total_site_electricity_kwh_weighted"] == peak_week["total_site_electricity_kwh_weighted"].max(), "Week_of_Year"][0] - - - # # filter to the week - # dfs_merged_pw = dfs_merged.loc[dfs_merged["Week_of_Year"]==peak_week, :].copy() - # #dfs_merged_pw = dfs_merged_pw.loc[:, dfs_merged_pw.columns.str.contains("electricity")] - # dfs_merged_pw.reset_index(inplace=True) - # dfs_merged_pw = dfs_merged_pw.sort_values('time') - - # # rename columns - # dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("electricity_", "") - # dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("_kwh_weighted", "") - - # # convert hourly kWH to 15 minute MW - # dfs_merged_pw.loc[:, self.order_list] = dfs_merged_pw.loc[:, self.order_list]/1000 - # dfs_merged_pw.loc[:, "total_site"] = dfs_merged_pw.loc[:, "total_site"]/1000 - - # # add upgrade traces - # # Create traces for area plot - # traces = [] - # # add aggregate measure load - # dfs_merged_pw_up = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] != "baseline"] - # dfs_merged_pw_up.columns = dfs_merged_pw_up.columns.str.replace("total_site", "Measure Total") - # # if only 1 upgrade, plot end uses and total - # if len(upgrade_num) == 1: - # # loop through end uses - # for enduse in self.order_list: - # trace = go.Scatter( - # x=dfs_merged_pw_up['time'], - # y=dfs_merged_pw_up[enduse], - # fill='tonexty', - # fillcolor=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], - # mode='none', - # line=dict(color=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], width=0.5), - # name=enduse, - # stackgroup='stack' - # ) - # traces.append(trace) - - # # Create a trace for the upgrade load - # upgrade_trace = go.Scatter( - # x=dfs_merged_pw_up['time'], - # y=dfs_merged_pw_up['Measure Total'], - # mode='lines', - # line=dict(color='black', width=1.8, dash='solid'), - # name='Measure Total', - # ) - # traces.append(upgrade_trace) - # else: - # # if more than 1 upgrade, add only aggregate loads - # for upgrade in upgrade_num: - # 
dfs_merged_pw_up_mult = dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[upgrade]] - # upgrade_trace = go.Scatter( - # x=dfs_merged_pw_up_mult['time'], - # y=dfs_merged_pw_up_mult['Measure Total'], - # mode='lines', - # line=dict(width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[upgrade]] - # name=self.dict_upid_to_upname[upgrade], - # ) - # traces.append(upgrade_trace) - - - # # add baseline load - # dfs_merged_pw_base = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name']=="baseline"] - # dfs_merged_pw_base.columns = dfs_merged_pw_base.columns.str.replace("total_site", "Baseline Total") - - # # Create a trace for the baseline load - # baseline_trace = go.Scatter( - # x=dfs_merged_pw_base['time'], - # y=dfs_merged_pw_base['Baseline Total'], - # mode='lines', - # #line=dict(color='black', width=1.75), - # line=dict(color='black', width=1.8, dash='dot'), - # name='Baseline Total' - # ) - # traces.append(baseline_trace) - - # # Create the layout - # layout = go.Layout( - # #title=f"{season} Peak Week - {state_name}", - # xaxis=dict(mirror=True, title=None, showline=True), - # yaxis=dict(mirror=True, title='Electricity Demand (MW)', range=[0, max_peak/1000], showline=True), - # legend=dict(font=dict(size=8), y=1.02, xanchor="left", x=0.0, orientation="h", yanchor="bottom", itemwidth=30), - # legend_traceorder="reversed", - # showlegend=True, - # template='simple_white', - # width=650, - # height=400, - # annotations=[ - # dict(x=-0.1, # Centered on the x-axis - # y=-0.35, # Adjust this value as needed to place the title correctly - # xref='paper', - # yref='paper', - # text=f"{season} Peak Week, Applicable Buildings - {state_name}", - # showarrow=False, - # font=dict( - # size=16 - # ))] - # ) - - # # Create the figure - # fig = go.Figure(data=traces, layout=layout) - - # # Save fig - # title = f"{season}_peak_week" - # fig_name = f'{title.replace(" ", "_").lower()}.{self.image_type}' - # fig_name_html = f'{title.replace(" ", "_").lower()}.html' - # fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{state_name}")) - # if not os.path.exists(fig_sub_dir): - # os.makedirs(fig_sub_dir) - # fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) - # fig_path_html = os.path.abspath(os.path.join(fig_sub_dir, fig_name_html)) - - # fig.write_image(fig_path, scale=10) - # fig.write_html(fig_path_html) - - # dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{state_name}.csv") + # merge into single dataframe + dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) - def plot_measure_timeseries_season_average_by_state(self, df, output_dir, states, color_map, comstock_run_name): + # set index + dfs_merged.set_index("time", inplace=True) + dfs_merged['Month'] = dfs_merged.index.month - # run crawler - run_data = BuildStockQuery('eulp', - 'enduse', - self.comstock_run_name, - buildstock_type='comstock', - skip_reports=False) + def map_to_season(month): + if 3 <= month <= 5: + return 'Spring' + elif 6 <= month <= 8: + return 'Summer' + elif 9 <= month <= 11: + return 'Fall' + else: + return 'Winter' + + # Apply the mapping function to create the "Season" column + dfs_merged['Season'] = dfs_merged['Month'].apply(map_to_season) + dfs_merged['Week_of_Year'] = dfs_merged.index.isocalendar().week + dfs_merged['Day_of_Year'] = dfs_merged.index.dayofyear + dfs_merged['Day_of_Week'] = dfs_merged.index.dayofweek + dfs_merged['Hour_of_Day'] = dfs_merged.index.hour + dfs_merged['Year'] = dfs_merged.index.year + # 
make dec 31st last week of year + dfs_merged.loc[dfs_merged['Day_of_Year']==365, 'Week_of_Year'] = 55 + dfs_merged = dfs_merged.loc[dfs_merged['Year']==2018, :] + max_peak = dfs_merged.loc[:, 'total_site_electricity_kwh'].max() + + # find peak week by season + seasons = ['Spring', 'Summer', 'Fall', 'Winter'] + for season in seasons: + peak_week = dfs_merged.loc[dfs_merged['Season']==season, ["total_site_electricity_kwh", "Week_of_Year"]] + peak_week = peak_week.loc[peak_week["total_site_electricity_kwh"] == peak_week["total_site_electricity_kwh"].max(), "Week_of_Year"][0] + + + # filter to the week + dfs_merged_pw = dfs_merged.loc[dfs_merged["Week_of_Year"]==peak_week, :].copy() + #dfs_merged_pw = dfs_merged_pw.loc[:, dfs_merged_pw.columns.str.contains("electricity")] + dfs_merged_pw.reset_index(inplace=True) + dfs_merged_pw = dfs_merged_pw.sort_values('time') + + # rename columns + dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("electricity_", "") + dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("_kwh", "") + + # convert hourly kWH to 15 minute MW + dfs_merged_pw.loc[:, self.order_list] = dfs_merged_pw.loc[:, self.order_list]/1000 + dfs_merged_pw.loc[:, "total_site"] = dfs_merged_pw.loc[:, "total_site"]/1000 + + # add upgrade traces + # Create traces for area plot + traces = [] + # add aggregate measure load + dfs_merged_pw_up = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] != "Baseline"] + dfs_merged_pw_up.columns = dfs_merged_pw_up.columns.str.replace("total_site", "Measure Total") + + # if only 1 upgrade, plot end uses and total + if len(upgrade_num) == 1: + # loop through end uses + for enduse in self.order_list: + trace = go.Scatter( + x=dfs_merged_pw_up['time'], + y=dfs_merged_pw_up[enduse], + fill='tonexty', + fillcolor=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], + mode='none', + line=dict(color=self.PLOTLY_ENDUSE_COLOR_DICT[enduse.replace('_'," ").title()], width=0.5), + name=enduse, + stackgroup='stack' + ) + traces.append(trace) + + # Create a trace for the upgrade load + upgrade_trace = go.Scatter( + x=dfs_merged_pw_up['time'], + y=dfs_merged_pw_up['Measure Total'], + mode='lines', + line=dict(color='black', width=1.8, dash='solid'), + name='Measure Total', + ) + traces.append(upgrade_trace) + else: + # if more than 1 upgrade, add only aggregate loads + for upgrade in upgrade_num: + dfs_merged_pw_up_mult = dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[upgrade]] + upgrade_trace = go.Scatter( + x=dfs_merged_pw_up_mult['time'], + y=dfs_merged_pw_up_mult['Measure Total'], + mode='lines', + line=dict(width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[upgrade]] + name=self.dict_upid_to_upname[upgrade], + ) + traces.append(upgrade_trace) + + + # add baseline load + dfs_merged_pw_base = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name']=="Baseline"] + dfs_merged_pw_base.columns = dfs_merged_pw_base.columns.str.replace("total_site", "Baseline Total") + + # Create a trace for the baseline load + baseline_trace = go.Scatter( + x=dfs_merged_pw_base['time'], + y=dfs_merged_pw_base['Baseline Total'], + mode='lines', + #line=dict(color='black', width=1.75), + line=dict(color='black', width=1.8, dash='dot'), + name='Baseline Total' + ) + traces.append(baseline_trace) + + # Create the layout + layout = go.Layout( + #title=f"{season} Peak Week - {state_name}", + xaxis=dict(mirror=True, title=None, showline=True), + yaxis=dict(mirror=True, title='Electricity Demand (MW)', range=[0, 
max_peak/1000], showline=True), + legend=dict(font=dict(size=8), y=1.02, xanchor="left", x=0.0, orientation="h", yanchor="bottom", itemwidth=30), + legend_traceorder="reversed", + showlegend=True, + template='simple_white', + width=650, + height=400, + annotations=[ + dict(x=-0.1, # Centered on the x-axis + y=-0.35, # Adjust this value as needed to place the title correctly + xref='paper', + yref='paper', + text=f"{season} Peak Week, Applicable Buildings - {state_name}", + showarrow=False, + font=dict( + size=16 + ))] + ) + + # Create the figure + fig = go.Figure(data=traces, layout=layout) + + # Save fig + title = f"{season}_peak_week" + fig_name = f'{title.replace(" ", "_").lower()}.{self.image_type}' + fig_name_html = f'{title.replace(" ", "_").lower()}.html' + fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{state_name}")) + if not os.path.exists(fig_sub_dir): + os.makedirs(fig_sub_dir) + fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) + fig_path_html = os.path.abspath(os.path.join(fig_sub_dir, fig_name_html)) + + fig.write_image(fig_path, scale=6) + fig.write_html(fig_path_html) + + print(f"{fig_sub_dir}/timeseries_data_{state_name})") + dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{state_name}.csv") + + def plot_measure_timeseries_season_average_by_state(self, df, output_dir, states, color_map, comstock_run_name): # get upgrade ID - df_upgrade = df.loc[df[self.UPGRADE_ID]!=0, :] + df_data = df.copy() + # coerce data type of upgrade ID - arrives as float, need str of int for querying + df_data[self.UPGRADE_ID] = pd.to_numeric(df_data[self.UPGRADE_ID], errors="coerce").astype("Int64").astype(str) + df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0") & (df_data[self.UPGRADE_ID]!=0), :] upgrade_num = list(df_upgrade[self.UPGRADE_ID].unique()) upgrade_name = list(df_upgrade[self.UPGRADE_NAME].unique()) - # get weights - dict_wgts = df_upgrade.groupby(self.BLDG_TYPE, observed=True)[self.BLDG_WEIGHT].mean().to_dict() - standard_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] upgrade_colors = {upgrade: standard_colors[i % len(standard_colors)] for i, upgrade in enumerate(upgrade_num)} @@ -3463,7 +3370,7 @@ def map_to_dow(dow): file_path = os.path.join(fig_sub_dir, f"timeseries_data_{state_name}.csv") dfs_merged=None if not os.path.exists(file_path): - dfs_base_combined, dfs_upgrade_combined = self.wgt_by_btype(df, run_data, dict_wgts, upgrade_num, state, upgrade_name) + dfs_base_combined, dfs_upgrade_combined = self.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) # merge into single dataframe dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) @@ -3491,7 +3398,7 @@ def map_to_dow(dow): dfs_merged['Day_Type'] = dfs_merged['Day_of_Week'].apply(map_to_dow) dfs_merged_gb = dfs_merged.groupby(['in.upgrade_name', 'Season', 'Day_Type', 'Hour_of_Day'], observed=True)[dfs_merged.loc[:, dfs_merged.columns.str.contains('_kwh')].columns].mean().reset_index() - max_peak = dfs_merged_gb.loc[:, 'total_site_electricity_kwh_weighted'].max() + max_peak = dfs_merged_gb.loc[:, 'total_site_electricity_kwh'].max() # find peak week by season seasons = ['Summer', 'Shoulder', 'Winter'] @@ -3517,15 +3424,15 @@ def map_to_dow(dow): # rename columns dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("electricity_", "") - dfs_merged_pw.columns = dfs_merged_pw.columns.str.replace("_kwh_weighted", "") + dfs_merged_pw.columns = 
dfs_merged_pw.columns.str.replace("_kwh", "") # convert hourly kWH to 15 minute MW dfs_merged_pw.loc[:, self.order_list] = dfs_merged_pw.loc[:, self.order_list]/1000 dfs_merged_pw.loc[:, "total_site"] = dfs_merged_pw.loc[:, "total_site"]/1000 dfs_merged_pw = dfs_merged_pw.sort_values('Hour_of_Day') - dfs_merged_pw_up = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] != 'baseline', :] + dfs_merged_pw_up = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] != 'Baseline', :] dfs_merged_pw_up.columns = dfs_merged_pw_up.columns.str.replace("total_site", "Measure Total") - dfs_merged_pw_up = dfs_merged_pw_up.loc[dfs_merged_pw['in.upgrade_name'] != 'baseline', :] + dfs_merged_pw_up = dfs_merged_pw_up.loc[dfs_merged_pw['in.upgrade_name'] != 'Baseline', :] if len(upgrade_num) == 1: # Create traces for area plot @@ -3582,7 +3489,7 @@ def map_to_dow(dow): # add baseline load - dfs_merged_pw_base = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] == "baseline"] + dfs_merged_pw_base = dfs_merged_pw.loc[dfs_merged_pw['in.upgrade_name'] == "Baseline"] dfs_merged_pw_base.columns = dfs_merged_pw_base.columns.str.replace("total_site", "Baseline Total") showlegend = 'Baseline Total' not in legend_entries @@ -3632,22 +3539,19 @@ def map_to_dow(dow): fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) fig_path_html = os.path.abspath(os.path.join(fig_sub_dir, fig_name_html)) - fig.write_image(fig_path, scale=10) + fig.write_image(fig_path, scale=6) fig.write_html(fig_path_html) def plot_measure_timeseries_annual_average_by_state_and_enduse(self, df, output_dir, states, color_map, comstock_run_name): - # run crawler - run_data = BuildStockQuery('eulp', 'enduse', self.comstock_run_name, buildstock_type='comstock', skip_reports=False, timeseries_table_suffix='something') - # get upgrade ID - df_upgrade = df.loc[df[self.UPGRADE_ID] != 0, :] + df_data = df.copy() + # coerce data type of upgrade ID - arrives as float, need str of int for querying + df_data[self.UPGRADE_ID] = pd.to_numeric(df_data[self.UPGRADE_ID], errors="coerce").astype("Int64").astype(str) + df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0") & (df_data[self.UPGRADE_ID]!=0), :] upgrade_num = list(df_upgrade[self.UPGRADE_ID].unique()) upgrade_name = list(df_upgrade[self.UPGRADE_NAME].unique()) - # get weights - dict_wgts = df_upgrade.groupby(self.BLDG_TYPE, observed=True)[self.BLDG_WEIGHT].mean().to_dict() - standard_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'] upgrade_colors = {upgrade: standard_colors[i % len(standard_colors)] for i, upgrade in enumerate(upgrade_num)} @@ -3678,7 +3582,7 @@ def map_to_dow(dow): if not os.path.exists(file_path): ######### This is where we get the weighted data - dfs_base_combined, dfs_upgrade_combined = self.wgt_by_btype(df, run_data, dict_wgts, upgrade_num, state, upgrade_name) + dfs_base_combined, dfs_upgrade_combined = self.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) ######### # merge into single dataframe @@ -3706,11 +3610,11 @@ def map_to_dow(dow): dfs_merged['Season'] = dfs_merged['Month'].apply(map_to_season) dfs_merged_gb = dfs_merged.groupby(['in.upgrade_name', 'Season', 'Hour_of_Day'], observed=True)[dfs_merged.loc[:, dfs_merged.columns.str.contains('_kwh')].columns].mean().reset_index() - max_peak = dfs_merged_gb.loc[:, 'total_site_electricity_kwh_weighted'].max() + max_peak = dfs_merged_gb.loc[:, 'total_site_electricity_kwh'].max() # rename columns, convert units 
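+            # e.g. 'total_site_electricity_kwh' becomes 'total_site' after the two renames below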
dfs_merged_gb.columns = dfs_merged_gb.columns.str.replace("electricity_", "") - dfs_merged_gb.columns = dfs_merged_gb.columns.str.replace("_kwh_weighted", "") + dfs_merged_gb.columns = dfs_merged_gb.columns.str.replace("_kwh", "") # find peak week by season seasons = ['Summer', 'Shoulder', 'Winter'] @@ -3758,7 +3662,7 @@ def map_to_dow(dow): showlegend = True # add upgrade - dfs_merged_gb_season_up = dfs_merged_gb_season.loc[dfs_merged_gb_season['in.upgrade_name'] != 'baseline', :] + dfs_merged_gb_season_up = dfs_merged_gb_season.loc[dfs_merged_gb_season['in.upgrade_name'] != 'Baseline', :] # if only 1 upgrade, plot end uses and total if len(upgrade_num) == 1: trace = go.Scatter( @@ -3787,7 +3691,7 @@ def map_to_dow(dow): fig.add_trace(upgrade_trace, row=row, col=col) # add baseline load - dfs_merged_gb_season_base = dfs_merged_gb_season.loc[dfs_merged_gb_season['in.upgrade_name'] == "baseline"] + dfs_merged_gb_season_base = dfs_merged_gb_season.loc[dfs_merged_gb_season['in.upgrade_name'] == "Baseline"] baseline_trace = go.Scatter( x=dfs_merged_gb_season_base['Hour_of_Day'], y=dfs_merged_gb_season_base[enduse]/1000, @@ -3840,5 +3744,5 @@ def map_to_dow(dow): fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) fig_path_html = os.path.abspath(os.path.join(fig_sub_dir, fig_name_html)) - fig.write_image(fig_path, scale=10) + fig.write_image(fig_path, scale=6) fig.write_html(fig_path_html) From 4185c1dcfd752cd4d0d13f30e1928a8d8bfdb516 Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Thu, 6 Nov 2025 08:44:41 -0700 Subject: [PATCH 04/16] Move AWS/S3 methods to comstock.py --- .../compare_upgrades-test_timeseries_plots.py | 526 +----------------- postprocessing/comstockpostproc/comstock.py | 501 +++++++++++++++++ 2 files changed, 519 insertions(+), 508 deletions(-) diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index a1e47d1af..26678a985 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -26,498 +26,6 @@ logging.basicConfig(level='INFO') # Use DEBUG, INFO, or WARNING logger = logging.getLogger(__name__) -def create_sightglass_tables( - s3_location: str, - dataset_name: str, - database_name: str = "vizstock", - glue_service_role: str = "vizstock-glue-service-role", - ): - fs, fs_path = url_to_fs(s3_location) - - glue = boto3.client("glue", region_name="us-west-2") - - crawler_name_base = f"vizstock_{dataset_name}" - tbl_prefix_base = f"{dataset_name}" - - crawler_params = { - "Role": glue_service_role, - "DatabaseName": database_name, - "Targets": {}, - "SchemaChangePolicy": { - "UpdateBehavior": "UPDATE_IN_DATABASE", - "DeleteBehavior": "DELETE_FROM_DATABASE", - } - } - - # Crawl each metadata target separately to create a - # unique table name for each metadata folder - logger.info(f"Creating crawlers for metadata") - md_agg_paths = fs.ls(f"{s3_location}") #{fs_path}/metadata_and_annual_results_aggregates/ - md_paths = fs.ls(f"{fs_path}/metadata_and_annual_results/") - - for md_path in md_agg_paths + md_paths: - md_geog = md_path.split('/')[-1] - if '_aggregates' in md_path: - crawler_name = f'{crawler_name_base}_md_agg_{md_geog}' - tbl_prefix = f'{tbl_prefix_base}_md_agg_{md_geog}_' - else: - crawler_name = f'{crawler_name_base}_md_{md_geog}' - tbl_prefix = f'{tbl_prefix_base}_md_{md_geog}_' - - crawler_params["Name"] = crawler_name - 
crawler_params["TablePrefix"] = tbl_prefix - crawler_params["Targets"]["S3Targets"] = [{ - "Path": f"s3://{md_path}/full/parquet", - "SampleSize": 1 - }] - - try: - crawler_info = glue.get_crawler(Name=crawler_name) - except glue.exceptions.EntityNotFoundException as ex: - logger.info(f"Creating Crawler {crawler_name}") - glue.create_crawler(**crawler_params) - else: - logger.info(f"Updating Crawler {crawler_name}") - glue.update_crawler(**crawler_params) - - logger.info(f"Running Crawler {crawler_name} on: {md_path}") - - expected_table_name = f"{tbl_prefix}{md_geog}" - logger.info(f"Expected Athena table: {expected_table_name} in database {database_name}") - - glue.start_crawler(Name=crawler_name) - time.sleep(10) - - crawler_info = glue.get_crawler(Name=crawler_name) - while crawler_info["Crawler"]["State"] != "READY": - time.sleep(10) - crawler_info = glue.get_crawler(Name=crawler_name) - logger.info(f"Crawler state: {crawler_info['Crawler']['State']}") - glue.delete_crawler(Name=crawler_name) - logger.info(f"Deleting Crawler {crawler_name}") - - return dataset_name - -def fix_timeseries_tables(dataset_name: str, database_name: str = "vizstock"): - logger.info("Updating timeseries table schemas") - - glue = boto3.client("glue", region_name="us-west-2") - tbl_prefix = f"{dataset_name}_" - get_tables_kw = {"DatabaseName": database_name, "Expression": f"{tbl_prefix}*"} - tbls_resp = glue.get_tables(**get_tables_kw) - tbl_list = tbls_resp["TableList"] - - while "NextToken" in tbls_resp: - tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_tables_kw) - tbl_list.extend(tbls_resp["TableList"]) - - for tbl in tbl_list: - - # Skip timeseries views that may exist, including ones created by this script - if tbl.get("TableType") == "VIRTUAL_VIEW": - continue - - table_name = tbl['Name'] - if not '_timeseries' in table_name: - continue - - # Check the dtype of the 'upgrade' column. - # Must be 'bigint' to match the metadata schema so joined queries work. - do_tbl_update = False - for part_key in tbl["PartitionKeys"]: - if part_key["Name"] == "upgrade" and part_key["Type"] != "bigint": - do_tbl_update = True - part_key["Type"] = "bigint" - - if not do_tbl_update: - logger.debug(f"Skipping {table_name} because it already has correct partition dtypes") - continue - - # Delete the automatically-created partition index. - # While the index exists, dtypes for the columns in the index cannot be modified. 
- indexes_resp = glue.get_partition_indexes( - DatabaseName=database_name, - TableName=table_name - ) - for index in indexes_resp['PartitionIndexDescriptorList']: - index_name = index['IndexName'] - index_keys = index['Keys'] - logger.debug(f'Deleting index {index_name} with keys {index_keys} in {table_name}') - glue.delete_partition_index( - DatabaseName=database_name, - TableName=table_name, - IndexName=index_name - ) - - # Wait for index deletion to complete - index_deleted = False - for i in range(0, 60): - indexes_resp = glue.get_partition_indexes( - DatabaseName=database_name, - TableName=table_name - ) - if len(indexes_resp['PartitionIndexDescriptorList']) == 0: - index_deleted = True - break - logger.debug('Waiting 10 seconds to check index deletion status') - time.sleep(10) - if not index_deleted: - raise RuntimeError(f'Did not delete index in 600 seconds, stopping.') - - # Change the dtype of the 'upgrade' partition column to bigint - tbl_input = {} - for k, v in tbl.items(): - if k in ( - "Name", - "Description", - "Owner", - "Retention", - "StorageDescriptor", - "PartitionKeys", - "TableType", - "Parameters", - ): - tbl_input[k] = v - logger.debug(f"Updating dtype of upgrade column in {table_name} to bigint") - glue.update_table(DatabaseName=database_name, TableInput=tbl_input) - - # Recreate the index - key_names = [k['Name'] for k in index_keys] - logger.debug(f"Creating index with columns: {key_names} in {table_name}") - glue.create_partition_index( - # CatalogId='string', - DatabaseName=database_name, - TableName=table_name, - PartitionIndex={ - 'Keys': key_names, - 'IndexName': 'vizstock_partition_index' - } - ) - - # Wait for index creation to complete - index_created = False - for i in range(0, 60): - indexes_resp = glue.get_partition_indexes( - DatabaseName=database_name, - TableName=table_name - ) - for index in indexes_resp['PartitionIndexDescriptorList']: - index_status = index['IndexStatus'] - if index_status == 'ACTIVE': - index_created = True - if index_created: - break - else: - logger.debug('Waiting 10 seconds to check index creation status') - time.sleep(10) - if not index_deleted: - raise RuntimeError(f'Did not create index in 600 seconds, stopping.') - -def column_filter(col): - return not ( - col.name.endswith(".co2e_kg") - or col.name.find("out.electricity.pv") > -1 - or col.name.find("out.electricity.purchased") > -1 - or col.name.find("out.electricity.net") > -1 - or col.name.find(".net.") > -1 - or col.name.find("applicability.") > -1 - or col.name.find("out.qoi.") > -1 - or col.name.find("out.emissions.") > -1 - or col.name.find("out.params.") > -1 - or col.name.find("out.utility_bills.") > -1 - or col.name.find("calc.") > -1 - or col.name.startswith("in.ejscreen") - or col.name.startswith("in.cejst") - or col.name.startswith("in.cluster_id") - or col.name.startswith("in.size_bin_id") - or col.name.startswith("in.sampling_region_id") - or col.name.startswith("out.district_heating.interior_equipment") - or col.name.startswith("out.district_heating.cooling") - or col.name.endswith("_applicable") - or col.name.startswith("out.electricity.total.apr") - or col.name.startswith("out.electricity.total.aug") - or col.name.startswith("out.electricity.total.dec") - or col.name.startswith("out.electricity.total.feb") - or col.name.startswith("out.electricity.total.jan") - or col.name.startswith("out.electricity.total.jul") - or col.name.startswith("out.electricity.total.jun") - or col.name.startswith("out.electricity.total.mar") - or 
col.name.startswith("out.electricity.total.may") - or col.name.startswith("out.electricity.total.nov") - or col.name.startswith("out.electricity.total.oct") - or col.name.startswith("out.electricity.total.sep") - ) - -def create_column_alias(col_name): - - # accept SQLAlchemy Column / ColumnElement or plain str - if not isinstance(col_name, str): - # prefer .name, then .key; fallback to str (last resort) - name = getattr(col_name, "name", None) or getattr(col_name, "key", None) - if not name: - name = str(col_name) - col_name = name - - # 1) name normalizations - normalized = ( - col_name - .replace("gas", "natural_gas") - .replace("fueloil", "fuel_oil") - .replace("districtheating", "district_heating") - .replace("districtcooling", "district_cooling") - ) - - # 2) regex patterns - m1 = re.search( - r"^(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(\w+)_(kwh|therm|mbtu|kbtu)$", - normalized, - ) - m2 = re.search( - r"^total_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", - normalized, - ) - m3 = re.search(r"^(total)_(site_energy)_(kwh|therm|mbtu|kbtu)$", normalized) - m4 = re.search( - r"^total_net_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", - normalized, - ) - m5 = re.search( - r"^total_purchased_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", - normalized, - ) - - if not (m1 or m2 or m3 or m4 or m5): - # Not an energy column we care about - return 1.0, normalized - - if m1: - fueltype, enduse, fuel_units = m1.groups() - elif m2: - fueltype, fuel_units = m2.groups() - enduse = "total" - elif m3: - enduse, fueltype, fuel_units = m3.groups() # "total","site_energy",units - # If you prefer "site_energy" to be reported as a pseudo-fuel, keep as-is. - # Otherwise map to a specific convention here. 
- elif m4: - fueltype, fuel_units = m4.groups() - enduse = "net" - else: # m5 - fueltype, fuel_units = m5.groups() - enduse = "purchased" - - # 3) build alias - col_alias = f"out.{fueltype}.{enduse}.energy_consumption" - - # Created using OpenStudio unit conversion library - energy_unit_conv_to_kwh = { - "mbtu": 293.0710701722222, - "m_btu": 293.0710701722222, - "therm": 29.307107017222222, - "kbtu": 0.2930710701722222, - } - - # 4) conversion factor to kWh - if fuel_units == "kwh": - conv_factor = 1.0 - else: - try: - conv_factor = energy_unit_conv_to_kwh[fuel_units] - except KeyError: - raise ValueError(f"Unhandled energy unit: {fuel_units!r} in column {col_name!r}") - - return conv_factor, col_alias - -def create_views( - dataset_name: str, database_name: str = "vizstock", workgroup: str = "eulp" -): - glue = boto3.client("glue", region_name="us-west-2") - - logger.info(f'Creating views for {dataset_name}') - - # Get a list of metadata tables - get_md_tables_kw = { - "DatabaseName": database_name, - "Expression": f"{dataset_name}_md_.*", - } - tbls_resp = glue.get_tables(**get_md_tables_kw) - tbl_list = tbls_resp["TableList"] - while "NextToken" in tbls_resp: - tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_md_tables_kw) - tbl_list.extend(tbls_resp["TableList"]) - md_tbls = [x["Name"] for x in tbl_list if x["TableType"] != "VIRTUAL_VIEW"] - - # Create a view for each metadata table - for metadata_tblname in md_tbls: - # Connect to the metadata table - engine = create_engine( - f"awsathena+rest://:@athena.us-west-2.amazonaws.com:443/{database_name}?work_group={workgroup}" - ) - meta = sa.MetaData(bind=engine) - metadata_tbl = sa.Table(metadata_tblname, meta, autoload=True) - logger.info(f"Loaded metadata table: {metadata_tblname}") - - # Extract the partition columns from the table name - bys = [] - if 'by' in metadata_tblname: - bys = metadata_tblname.replace(f'{dataset_name}_md_', '') - bys = bys.replace(f'agg_', '').replace(f'by_', '').replace(f'_parquet', '').split('_and_') - logger.debug(f"Partition identifiers = {', '.join(bys)}") - - # Rename the partition column to match hive partition - # Move it up in the column order for easier debugging - # TODO figure out a better approach than hard-coding - aliases = { - 'state': {'col': 'in.state','alias': 'state'}, - 'puma': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'county': {'col': 'in.nhgis_county_gisjoin', 'alias': 'county'}, - 'puma_midwest': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'puma_northeast': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'puma_south': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'puma_west': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'} - } - - # Select columns for the metadata, aliasing partition columns - cols = [] - for col in metadata_tbl.columns: - cols.append(col) - - continue - - # Remove everything but the input characteristics and output energy columns from the view - cols = list(filter(column_filter, cols)) - - # Alias the out.foo columns to remove units - # TODO: add this to timeseries stuff - cols_aliased = [] - for col in cols: - if col.name.startswith("out.") and col.name.find("..") > -1: - unitless_name = col.name.split('..')[0] - cols_aliased.append(col.label(unitless_name)) - # logger.debug(f'Aliasing {col.name} to {unitless_name}') - elif col.name == 'in.sqft..ft2': - unitless_name = 'in.sqft' # Special requirement for SightGlass - cols_aliased.append(col.label(unitless_name)) - # logger.debug(f'Aliasing {col.name} to 
{unitless_name}') - else: - cols_aliased.append(col) - cols = cols_aliased - - # Check columns - col_type_errs = 0 - for c in cols: - # Column name length for postgres compatibility - if len(c.name) > 63: - col_type_errs += 1 - logger.error(f'column: `{c.name}` must be < 64 chars for SightGlass postgres storage') - - # in.foo columns may not be bigints or booleans because they cannot be serialized to JSON - # when determining the unique set of filter values for SightGlass - if c.name.startswith('in.') and (str(c.type) == 'BIGINT' or (str(c.type) == 'BOOLEAN')): - col_type_errs += 1 - logger.error(f'in.foo column {c} may not be a BIGINT or BOOLEAN for SightGlass to work') - - # Expected bigint columns in SightGlass - expected_bigints = ['metadata_index', 'upgrade', 'bldg_id'] - if c.name in expected_bigints: - if not str(c.type) == 'BIGINT': - col_type_errs += 1 - logger.error(f'Column {c} must be a BIGINT, but found {c.type}') - - if col_type_errs > 0: - raise RuntimeError(f'{col_type_errs} were found in columns, correct these in metadata before proceeding.') - - # For PUMAs, need to create one view for each census region - if 'puma' in bys: - regions = ("South", "Midwest", "Northeast", "West") - for region in regions: - q = sa.select(cols).where(metadata_tbl.c["in.census_region_name"].in_([region])) - view_name = metadata_tblname.replace('_by_state_and_puma_parquet', '_puma') - view_name = f"{view_name}_{region.lower()}_vu" - db_plus_vu = f'{database_name}_{view_name}' - if len(db_plus_vu) > 63: - raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') - view = sa.Table(view_name, meta) - create_view = CreateView(view, q, or_replace=True) - logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") - engine.execute(create_view) - # logger.debug('Columns in view:') - # for c in cols: - # logger.debug(c) - else: - q = sa.select(cols) - view_name = metadata_tblname.replace('_parquet', '') - view_name = f"{view_name}_vu" - db_plus_vu = f'{database_name}_{view_name}' - if len(db_plus_vu) > 63: - raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') - view = sa.Table(view_name, meta) - create_view = CreateView(view, q, or_replace=True) - logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") - engine.execute(create_view) - # logger.debug('Columns in view:') - # for c in cols: - # logger.debug(c) - - # Get a list of timeseries tables - get_ts_tables_kw = { - "DatabaseName": database_name, - "Expression": f"{dataset_name}_timeseries*", - } - tbls_resp = glue.get_tables(**get_ts_tables_kw) - tbl_list = tbls_resp["TableList"] - while "NextToken" in tbls_resp: - tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_ts_tables_kw) - tbl_list.extend(tbls_resp["TableList"]) - ts_tbls = [x["Name"] for x in tbl_list if x["TableType"] != "VIRTUAL_VIEW"] - - # Create a view for each timeseries table, removing the emissions columns - for ts_tblname in ts_tbls: - engine = create_engine( - f"awsathena+rest://:@athena.us-west-2.amazonaws.com:443/{database_name}?work_group={workgroup}" - ) - meta = sa.MetaData(bind=engine) - ts_tbl = sa.Table(ts_tblname, meta, autoload=True) - cols = list(filter(column_filter, ts_tbl.columns)) - cols_aliased = [] - for col in cols: - - # rename - conv_factor, col_alias = create_column_alias(col) - if (col.name == col_alias) & (conv_factor==1): # and conversion factor is 1 - cols_aliased.append(col) - elif (col.name != col_alias) & 
(conv_factor==1): #name different, conversion factor 1 - cols_aliased.append(col.label(col_alias)) #TODO: return both alias and new units - else: # name and conversion different - cols_aliased.append((col/conv_factor).label(col_alias)) #TODO: return both alias and new units - - if col.name == 'timestamp': #timestamp - # Convert bigint to timestamp type if necessary - if str(col.type) == 'BIGINT': - # Pandas uses nanosecond resolution integer timestamps. - # Presto expects second resolution values in from_unixtime. - # Must divide values by 1e9 to go from nanoseconds to seconds. - cols_aliased.append(sa.func.from_unixtime(col / 1e9).label('timestamp')) #NOTE: syntax for adding unit conversions - else: - cols_aliased.append(col) - - cols = cols_aliased - - q = sa.select(cols) - view_name = f"{ts_tblname}_vu" - - db_plus_vu = f'{database_name}_{view_name}' - if len(db_plus_vu) > 63: - raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') - view = sa.Table(view_name, meta) - create_view = CreateView(view, q, or_replace=True) - logger.info(f"Creating timeseries view: {view_name}") - engine.execute(create_view) - logger.debug('Columns in view:') - for c in cols: - logger.debug(c) - - def main(): # ComStock run comstock = cspp.ComStock( @@ -544,7 +52,7 @@ def main(): 'AZ': 'Arizona', 'TN': 'Tennessee' }, - # [('state',['VA','AZ']) + upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' ) @@ -599,15 +107,17 @@ def main(): } ] - #for geo_export in geo_exports: - # for upgrade_id in comstock.upgrade_ids_to_process: - # # if upgrade_id == 0: - # # continue - # # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) - # if comstock.make_timeseries_plots: - # s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" - # s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) - # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) + for geo_export in geo_exports: + for upgrade_id in comstock.upgrade_ids_to_process: + # if upgrade_id == 0: + # continue + # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) + + # Also write to S3 if making timeseries plots + if comstock.make_timeseries_plots: + s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) + comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) # write select results to S3 for Athena/Glue when needed for timeseries plots @@ -622,12 +132,12 @@ def main(): # Export parquet files to S3 for Athena/Glue # TODO: modify so that county data can be used for timeseries plots # TODO: Modify geo export structure to specify parquet files only for this part of the workflow - #create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", - # dataset_name=crawler_name, - # database_name=database, - # glue_service_role=glue_service_role) - #fix_timeseries_tables(crawler_name, database) - #create_views(crawler_name, database, workgroup) + 
comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", + dataset_name=crawler_name, + database_name=database, + glue_service_role=glue_service_role) + comstock.fix_timeseries_tables(crawler_name, database) + comstock.create_views(crawler_name, database, workgroup) # Create measure run comparisons; only use if run has measures comparison = cspp.ComStockMeasureComparison(comstock, states=comstock.states, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 78ac55820..ecd383de2 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -4,8 +4,12 @@ import s3fs from functools import lru_cache from fsspec.core import url_to_fs +from sqlalchemy.engine import create_engine from joblib import Parallel, delayed from fsspec import register_implementation +import sqlalchemy as sa +import time +from sqlalchemy_views import CreateView import boto3 import botocore @@ -4044,3 +4048,500 @@ def sightGlass_metadata_check(self, comstock_data: pl.LazyFrame): if err_log: raise ValueError(err_log) + + + @staticmethod + def create_sightglass_tables( + s3_location: str, + dataset_name: str, + database_name: str = "vizstock", + glue_service_role: str = "vizstock-glue-service-role", + ): + fs, fs_path = url_to_fs(s3_location) + + glue = boto3.client("glue", region_name="us-west-2") + + crawler_name_base = f"vizstock_{dataset_name}" + tbl_prefix_base = f"{dataset_name}" + + crawler_params = { + "Role": glue_service_role, + "DatabaseName": database_name, + "Targets": {}, + "SchemaChangePolicy": { + "UpdateBehavior": "UPDATE_IN_DATABASE", + "DeleteBehavior": "DELETE_FROM_DATABASE", + } + } + + # Crawl each metadata target separately to create a + # unique table name for each metadata folder + logger.info(f"Creating crawlers for metadata") + md_agg_paths = fs.ls(f"{s3_location}") #{fs_path}/metadata_and_annual_results_aggregates/ + md_paths = fs.ls(f"{fs_path}/metadata_and_annual_results/") + + for md_path in md_agg_paths + md_paths: + md_geog = md_path.split('/')[-1] + if '_aggregates' in md_path: + crawler_name = f'{crawler_name_base}_md_agg_{md_geog}' + tbl_prefix = f'{tbl_prefix_base}_md_agg_{md_geog}_' + else: + crawler_name = f'{crawler_name_base}_md_{md_geog}' + tbl_prefix = f'{tbl_prefix_base}_md_{md_geog}_' + + crawler_params["Name"] = crawler_name + crawler_params["TablePrefix"] = tbl_prefix + crawler_params["Targets"]["S3Targets"] = [{ + "Path": f"s3://{md_path}/full/parquet", + "SampleSize": 1 + }] + + try: + crawler_info = glue.get_crawler(Name=crawler_name) + except glue.exceptions.EntityNotFoundException as ex: + logger.info(f"Creating Crawler {crawler_name}") + glue.create_crawler(**crawler_params) + else: + logger.info(f"Updating Crawler {crawler_name}") + glue.update_crawler(**crawler_params) + + logger.info(f"Running Crawler {crawler_name} on: {md_path}") + + expected_table_name = f"{tbl_prefix}{md_geog}" + logger.info(f"Expected Athena table: {expected_table_name} in database {database_name}") + + glue.start_crawler(Name=crawler_name) + time.sleep(10) + + crawler_info = glue.get_crawler(Name=crawler_name) + while crawler_info["Crawler"]["State"] != "READY": + time.sleep(10) + crawler_info = glue.get_crawler(Name=crawler_name) + logger.info(f"Crawler state: {crawler_info['Crawler']['State']}") + glue.delete_crawler(Name=crawler_name) + 
logger.info(f"Deleting Crawler {crawler_name}") + + return dataset_name + + @staticmethod + def fix_timeseries_tables(dataset_name: str, database_name: str = "vizstock"): + logger.info("Updating timeseries table schemas") + + glue = boto3.client("glue", region_name="us-west-2") + tbl_prefix = f"{dataset_name}_" + get_tables_kw = {"DatabaseName": database_name, "Expression": f"{tbl_prefix}*"} + tbls_resp = glue.get_tables(**get_tables_kw) + tbl_list = tbls_resp["TableList"] + + while "NextToken" in tbls_resp: + tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_tables_kw) + tbl_list.extend(tbls_resp["TableList"]) + + for tbl in tbl_list: + + # Skip timeseries views that may exist, including ones created by this script + if tbl.get("TableType") == "VIRTUAL_VIEW": + continue + + table_name = tbl['Name'] + if not '_timeseries' in table_name: + continue + + # Check the dtype of the 'upgrade' column. + # Must be 'bigint' to match the metadata schema so joined queries work. + do_tbl_update = False + for part_key in tbl["PartitionKeys"]: + if part_key["Name"] == "upgrade" and part_key["Type"] != "bigint": + do_tbl_update = True + part_key["Type"] = "bigint" + + if not do_tbl_update: + logger.debug(f"Skipping {table_name} because it already has correct partition dtypes") + continue + + # Delete the automatically-created partition index. + # While the index exists, dtypes for the columns in the index cannot be modified. + indexes_resp = glue.get_partition_indexes( + DatabaseName=database_name, + TableName=table_name + ) + for index in indexes_resp['PartitionIndexDescriptorList']: + index_name = index['IndexName'] + index_keys = index['Keys'] + logger.debug(f'Deleting index {index_name} with keys {index_keys} in {table_name}') + glue.delete_partition_index( + DatabaseName=database_name, + TableName=table_name, + IndexName=index_name + ) + + # Wait for index deletion to complete + index_deleted = False + for i in range(0, 60): + indexes_resp = glue.get_partition_indexes( + DatabaseName=database_name, + TableName=table_name + ) + if len(indexes_resp['PartitionIndexDescriptorList']) == 0: + index_deleted = True + break + logger.debug('Waiting 10 seconds to check index deletion status') + time.sleep(10) + if not index_deleted: + raise RuntimeError(f'Did not delete index in 600 seconds, stopping.') + + # Change the dtype of the 'upgrade' partition column to bigint + tbl_input = {} + for k, v in tbl.items(): + if k in ( + "Name", + "Description", + "Owner", + "Retention", + "StorageDescriptor", + "PartitionKeys", + "TableType", + "Parameters", + ): + tbl_input[k] = v + logger.debug(f"Updating dtype of upgrade column in {table_name} to bigint") + glue.update_table(DatabaseName=database_name, TableInput=tbl_input) + + # Recreate the index + key_names = [k['Name'] for k in index_keys] + logger.debug(f"Creating index with columns: {key_names} in {table_name}") + glue.create_partition_index( + # CatalogId='string', + DatabaseName=database_name, + TableName=table_name, + PartitionIndex={ + 'Keys': key_names, + 'IndexName': 'vizstock_partition_index' + } + ) + + # Wait for index creation to complete + index_created = False + for i in range(0, 60): + indexes_resp = glue.get_partition_indexes( + DatabaseName=database_name, + TableName=table_name + ) + for index in indexes_resp['PartitionIndexDescriptorList']: + index_status = index['IndexStatus'] + if index_status == 'ACTIVE': + index_created = True + if index_created: + break + else: + logger.debug('Waiting 10 seconds to check index creation 
status') + time.sleep(10) + if not index_deleted: + raise RuntimeError(f'Did not create index in 600 seconds, stopping.') + + @staticmethod + def column_filter(col): + return not ( + col.name.endswith(".co2e_kg") + or col.name.find("out.electricity.pv") > -1 + or col.name.find("out.electricity.purchased") > -1 + or col.name.find("out.electricity.net") > -1 + or col.name.find(".net.") > -1 + or col.name.find("applicability.") > -1 + or col.name.find("out.qoi.") > -1 + or col.name.find("out.emissions.") > -1 + or col.name.find("out.params.") > -1 + or col.name.find("out.utility_bills.") > -1 + or col.name.find("calc.") > -1 + or col.name.startswith("in.ejscreen") + or col.name.startswith("in.cejst") + or col.name.startswith("in.cluster_id") + or col.name.startswith("in.size_bin_id") + or col.name.startswith("in.sampling_region_id") + or col.name.startswith("out.district_heating.interior_equipment") + or col.name.startswith("out.district_heating.cooling") + or col.name.endswith("_applicable") + or col.name.startswith("out.electricity.total.apr") + or col.name.startswith("out.electricity.total.aug") + or col.name.startswith("out.electricity.total.dec") + or col.name.startswith("out.electricity.total.feb") + or col.name.startswith("out.electricity.total.jan") + or col.name.startswith("out.electricity.total.jul") + or col.name.startswith("out.electricity.total.jun") + or col.name.startswith("out.electricity.total.mar") + or col.name.startswith("out.electricity.total.may") + or col.name.startswith("out.electricity.total.nov") + or col.name.startswith("out.electricity.total.oct") + or col.name.startswith("out.electricity.total.sep") + ) + + @staticmethod + def create_column_alias(col_name): + + # accept SQLAlchemy Column / ColumnElement or plain str + if not isinstance(col_name, str): + # prefer .name, then .key; fallback to str (last resort) + name = getattr(col_name, "name", None) or getattr(col_name, "key", None) + if not name: + name = str(col_name) + col_name = name + + # 1) name normalizations + normalized = ( + col_name + .replace("gas", "natural_gas") + .replace("fueloil", "fuel_oil") + .replace("districtheating", "district_heating") + .replace("districtcooling", "district_cooling") + ) + + # 2) regex patterns + m1 = re.search( + r"^(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(\w+)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + m2 = re.search( + r"^total_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + m3 = re.search(r"^(total)_(site_energy)_(kwh|therm|mbtu|kbtu)$", normalized) + m4 = re.search( + r"^total_net_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + m5 = re.search( + r"^total_purchased_site_(electricity|natural_gas|fuel_oil|propane|district_heating|district_cooling|other_fuel)_(kwh|therm|mbtu|kbtu)$", + normalized, + ) + + if not (m1 or m2 or m3 or m4 or m5): + # Not an energy column we care about + return 1.0, normalized + + if m1: + fueltype, enduse, fuel_units = m1.groups() + elif m2: + fueltype, fuel_units = m2.groups() + enduse = "total" + elif m3: + enduse, fueltype, fuel_units = m3.groups() # "total","site_energy",units + # If you prefer "site_energy" to be reported as a pseudo-fuel, keep as-is. + # Otherwise map to a specific convention here. 
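+            # e.g. normalized == "total_site_energy_kbtu" takes this branch with
+            # enduse == "total", fueltype == "site_energy", fuel_units == "kbtu",
+            # producing the alias "out.site_energy.total.energy_consumption" below.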
+ elif m4: + fueltype, fuel_units = m4.groups() + enduse = "net" + else: # m5 + fueltype, fuel_units = m5.groups() + enduse = "purchased" + + # 3) build alias + col_alias = f"out.{fueltype}.{enduse}.energy_consumption" + + # Created using OpenStudio unit conversion library + energy_unit_conv_to_kwh = { + "mbtu": 293.0710701722222, + "m_btu": 293.0710701722222, + "therm": 29.307107017222222, + "kbtu": 0.2930710701722222, + } + + # 4) conversion factor to kWh + if fuel_units == "kwh": + conv_factor = 1.0 + else: + try: + conv_factor = energy_unit_conv_to_kwh[fuel_units] + except KeyError: + raise ValueError(f"Unhandled energy unit: {fuel_units!r} in column {col_name!r}") + + return conv_factor, col_alias + + @staticmethod + def create_views( + dataset_name: str, database_name: str = "vizstock", workgroup: str = "eulp" + ): + glue = boto3.client("glue", region_name="us-west-2") + + logger.info(f'Creating views for {dataset_name}') + + # Get a list of metadata tables + get_md_tables_kw = { + "DatabaseName": database_name, + "Expression": f"{dataset_name}_md_.*", + } + tbls_resp = glue.get_tables(**get_md_tables_kw) + tbl_list = tbls_resp["TableList"] + while "NextToken" in tbls_resp: + tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_md_tables_kw) + tbl_list.extend(tbls_resp["TableList"]) + md_tbls = [x["Name"] for x in tbl_list if x["TableType"] != "VIRTUAL_VIEW"] + + # Create a view for each metadata table + for metadata_tblname in md_tbls: + # Connect to the metadata table + engine = create_engine( + f"awsathena+rest://:@athena.us-west-2.amazonaws.com:443/{database_name}?work_group={workgroup}" + ) + meta = sa.MetaData(bind=engine) + metadata_tbl = sa.Table(metadata_tblname, meta, autoload=True) + logger.info(f"Loaded metadata table: {metadata_tblname}") + + # Extract the partition columns from the table name + bys = [] + if 'by' in metadata_tblname: + bys = metadata_tblname.replace(f'{dataset_name}_md_', '') + bys = bys.replace(f'agg_', '').replace(f'by_', '').replace(f'_parquet', '').split('_and_') + logger.debug(f"Partition identifiers = {', '.join(bys)}") + + # Rename the partition column to match hive partition + # Move it up in the column order for easier debugging + # TODO figure out a better approach than hard-coding + aliases = { + 'state': {'col': 'in.state','alias': 'state'}, + 'puma': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'county': {'col': 'in.nhgis_county_gisjoin', 'alias': 'county'}, + 'puma_midwest': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'puma_northeast': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'puma_south': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, + 'puma_west': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'} + } + + # Select columns for the metadata, aliasing partition columns + cols = [] + for col in metadata_tbl.columns: + cols.append(col) + + continue + + # Remove everything but the input characteristics and output energy columns from the view + cols = list(filter(ComStock.column_filter, cols)) + + # Alias the out.foo columns to remove units + # TODO: add this to timeseries stuff + cols_aliased = [] + for col in cols: + if col.name.startswith("out.") and col.name.find("..") > -1: + unitless_name = col.name.split('..')[0] + cols_aliased.append(col.label(unitless_name)) + # logger.debug(f'Aliasing {col.name} to {unitless_name}') + elif col.name == 'in.sqft..ft2': + unitless_name = 'in.sqft' # Special requirement for SightGlass + cols_aliased.append(col.label(unitless_name)) + # 
logger.debug(f'Aliasing {col.name} to {unitless_name}') + else: + cols_aliased.append(col) + cols = cols_aliased + + # Check columns + col_type_errs = 0 + for c in cols: + # Column name length for postgres compatibility + if len(c.name) > 63: + col_type_errs += 1 + logger.error(f'column: `{c.name}` must be < 64 chars for SightGlass postgres storage') + + # in.foo columns may not be bigints or booleans because they cannot be serialized to JSON + # when determining the unique set of filter values for SightGlass + if c.name.startswith('in.') and (str(c.type) == 'BIGINT' or (str(c.type) == 'BOOLEAN')): + col_type_errs += 1 + logger.error(f'in.foo column {c} may not be a BIGINT or BOOLEAN for SightGlass to work') + + # Expected bigint columns in SightGlass + expected_bigints = ['metadata_index', 'upgrade', 'bldg_id'] + if c.name in expected_bigints: + if not str(c.type) == 'BIGINT': + col_type_errs += 1 + logger.error(f'Column {c} must be a BIGINT, but found {c.type}') + + if col_type_errs > 0: + raise RuntimeError(f'{col_type_errs} were found in columns, correct these in metadata before proceeding.') + + # For PUMAs, need to create one view for each census region + if 'puma' in bys: + regions = ("South", "Midwest", "Northeast", "West") + for region in regions: + q = sa.select(cols).where(metadata_tbl.c["in.census_region_name"].in_([region])) + view_name = metadata_tblname.replace('_by_state_and_puma_parquet', '_puma') + view_name = f"{view_name}_{region.lower()}_vu" + db_plus_vu = f'{database_name}_{view_name}' + if len(db_plus_vu) > 63: + raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') + view = sa.Table(view_name, meta) + create_view = CreateView(view, q, or_replace=True) + logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") + engine.execute(create_view) + # logger.debug('Columns in view:') + # for c in cols: + # logger.debug(c) + else: + q = sa.select(cols) + view_name = metadata_tblname.replace('_parquet', '') + view_name = f"{view_name}_vu" + db_plus_vu = f'{database_name}_{view_name}' + if len(db_plus_vu) > 63: + raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') + view = sa.Table(view_name, meta) + create_view = CreateView(view, q, or_replace=True) + logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") + engine.execute(create_view) + # logger.debug('Columns in view:') + # for c in cols: + # logger.debug(c) + + # Get a list of timeseries tables + get_ts_tables_kw = { + "DatabaseName": database_name, + "Expression": f"{dataset_name}_timeseries*", + } + tbls_resp = glue.get_tables(**get_ts_tables_kw) + tbl_list = tbls_resp["TableList"] + while "NextToken" in tbls_resp: + tbls_resp = glue.get_tables(NextToken=tbls_resp["NextToken"], **get_ts_tables_kw) + tbl_list.extend(tbls_resp["TableList"]) + ts_tbls = [x["Name"] for x in tbl_list if x["TableType"] != "VIRTUAL_VIEW"] + + # Create a view for each timeseries table, removing the emissions columns + for ts_tblname in ts_tbls: + engine = create_engine( + f"awsathena+rest://:@athena.us-west-2.amazonaws.com:443/{database_name}?work_group={workgroup}" + ) + meta = sa.MetaData(bind=engine) + ts_tbl = sa.Table(ts_tblname, meta, autoload=True) + cols = list(filter(ComStock.column_filter, ts_tbl.columns)) + cols_aliased = [] + for col in cols: + + # rename + conv_factor, col_alias = ComStock.create_column_alias(col) + if (col.name == col_alias) & (conv_factor==1): # and conversion factor is 1 + 
cols_aliased.append(col) + elif (col.name != col_alias) & (conv_factor==1): #name different, conversion factor 1 + cols_aliased.append(col.label(col_alias)) #TODO: return both alias and new units + else: # name and conversion different + cols_aliased.append((col/conv_factor).label(col_alias)) #TODO: return both alias and new units + + if col.name == 'timestamp': #timestamp + # Convert bigint to timestamp type if necessary + if str(col.type) == 'BIGINT': + # Pandas uses nanosecond resolution integer timestamps. + # Presto expects second resolution values in from_unixtime. + # Must divide values by 1e9 to go from nanoseconds to seconds. + cols_aliased.append(sa.func.from_unixtime(col / 1e9).label('timestamp')) #NOTE: syntax for adding unit conversions + else: + cols_aliased.append(col) + + cols = cols_aliased + + q = sa.select(cols) + view_name = f"{ts_tblname}_vu" + + db_plus_vu = f'{database_name}_{view_name}' + if len(db_plus_vu) > 63: + raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') + view = sa.Table(view_name, meta) + create_view = CreateView(view, q, or_replace=True) + logger.info(f"Creating timeseries view: {view_name}") + engine.execute(create_view) + logger.debug('Columns in view:') + for c in cols: + logger.debug(c) From 7f605d1ab152d31c619c53248522e7e34bac0a00 Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Thu, 6 Nov 2025 09:59:26 -0700 Subject: [PATCH 05/16] Move S3 retrieval methods to comstock.py --- postprocessing/comstockpostproc/comstock.py | 115 ++++++++++++++++ .../comstock_measure_comparison.py | 13 +- .../comstockpostproc/plotting_mixin.py | 128 +----------------- 3 files changed, 130 insertions(+), 126 deletions(-) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index ecd383de2..7f35a2912 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -35,6 +35,7 @@ from comstockpostproc.gas_correction_model import GasCorrectionModelMixin from comstockpostproc.s3_utilities_mixin import S3UtilitiesMixin, write_geo_data from buildstock_query import BuildStockQuery +from .comstock_query_builder import ComStockQueryBuilder logger = logging.getLogger(__name__) @@ -4545,3 +4546,117 @@ def create_views( logger.debug('Columns in view:') for c in cols: logger.debug(c) + + # get weighted load profiles + + def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_name): + """ + This method retrieves weighted timeseries profiles from s3/athena. + Returns dataframe with weighted kWh columns for baseline and upgrade. 
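+        Args (as used by the plotting-mixin callers in this patch):
+            df: annual results dataframe containing the upgrade ID and upgrade name columns.
+            upgrade_num: list of upgrade IDs, coerced to strings of ints by the callers.
+            state: state abbreviation used to restrict the Athena query, e.g. 'MA'.
+            upgrade_name: list of upgrade names corresponding to upgrade_num.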
+ """ + + # run crawler + athena_client = BuildStockQuery(workgroup='eulp', + db_name='enduse', + table_name=self.comstock_run_name, + buildstock_type='comstock', + skip_reports=True, + metadata_table_suffix='_md_agg_national_by_state_vu', #TODO: make this more dynamic for geo resolution + #ts_table_suffix=f"_timeseries_vu" + ) + + # Create upgrade ID to name mapping from existing data + upgrade_name_mapping = self.data.select(self.UPGRADE_ID, self.UPGRADE_NAME).unique().collect().sort(self.UPGRADE_ID).to_dict(as_series=False) + dict_upid_to_upname = dict(zip(upgrade_name_mapping[self.UPGRADE_ID], upgrade_name_mapping[self.UPGRADE_NAME])) + + print(athena_client._tables.keys()) + + # generate applicability for upgrade + # if there are multiple upgrades, applicability is based on first upgrade only + builder = ComStockQueryBuilder(self.comstock_run_name) + applicability_query = builder.get_applicability_query( + upgrade_ids=upgrade_num, + state=state, + columns=[ + 'dataset', + '"in.state"', + 'bldg_id', + 'upgrade', + 'applicability' + ] + ) + + #print("Applicability Query:") + #print("="*80) + #print(applicability_query) + #print("="*80) + + # Execute query to get applicable buildings + applic_df = athena_client.execute(applicability_query) + applic_bldgs_list = list(applic_df['bldg_id'].unique()) + applic_bldgs_list = [int(x) for x in applic_bldgs_list] + + import time + from datetime import datetime, time + + # loop through upgrades + for upgrade_id in df[self.UPGRADE_ID].unique(): + + # if there are upgrades, restrict baseline to match upgrade applicability + if (upgrade_id in (0, "0")) and (df[self.UPGRADE_ID].nunique() > 1): + + # Create query builder and generate query + builder = ComStockQueryBuilder(self.comstock_run_name) + query = builder.get_timeseries_aggregation_query( + upgrade_id=upgrade_id, + enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), + restrictions=[ + ('in.state', [f"{state}"]), + ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] + ], + timestamp_grouping='hour', + weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' + ) + + #print("Generated Query:") + #print("="*80) + #print(query) + #print("="*80) + + print(f"Getting weighted baseline load profile for state {state} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings.") + + # temp save figure + df_base_ts_agg_weighted = athena_client.execute(query) + df_base_ts_agg_weighted[self.UPGRADE_NAME] = dict_upid_to_upname[0] + + else: + # baseline load data when no upgrades are present, or upgrade load data + + + ## Create query builder and generate query for upgrade data + builder = ComStockQueryBuilder(self.comstock_run_name) + query = builder.get_timeseries_aggregation_query( + upgrade_id=upgrade_id, + enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), + restrictions=[ + ('in.state', [f"{state}"]), + ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] + ], + timestamp_grouping='hour', + weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' + ) + + df_up_ts_agg_weighted = athena_client.execute(query) + df_up_ts_agg_weighted[self.UPGRADE_NAME] = dict_upid_to_upname[int(upgrade_num[0])] + + #print("Generated Upgrade Query:") + #print("="*80) + #print(query) + #print("="*80) + + + # Initialize default values in case no data was processed + df_base_ts_agg_weighted = pd.DataFrame() if 'df_base_ts_agg_weighted' not in 
locals() else df_base_ts_agg_weighted + df_up_ts_agg_weighted = pd.DataFrame() if 'df_up_ts_agg_weighted' not in locals() else df_up_ts_agg_weighted + + return df_base_ts_agg_weighted, df_up_ts_agg_weighted diff --git a/postprocessing/comstockpostproc/comstock_measure_comparison.py b/postprocessing/comstockpostproc/comstock_measure_comparison.py index db56bb936..8b3a80be1 100644 --- a/postprocessing/comstockpostproc/comstock_measure_comparison.py +++ b/postprocessing/comstockpostproc/comstock_measure_comparison.py @@ -28,6 +28,9 @@ def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_p self.data = comstock_object.plotting_data.clone() #not really a deep copy, only schema is copied but not data. assert isinstance(self.data, pl.LazyFrame) + # Store reference to ComStock object for use in plotting methods + self.comstock_object = comstock_object + self.color_map = {} self.image_type = image_type self.name = name @@ -192,14 +195,14 @@ def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make if make_timeseries_plots: TIMESERIES_PARAMS = {'comstock_run_name': self.comstock_run_name, 'states': states, 'color_map': color_map, - 'output_dir': output_dir} + 'output_dir': output_dir, 'comstock_obj': self.comstock_object} LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_peak_week_by_state, lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT, - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_season_average_by_state, lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_season_average_by_state, lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_season_average_by_state, lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_annual_average_by_state_and_enduse, lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT time_end = pd.Timestamp.now() logger.info(f"Time taken to make plots is {time_end - time_start}") diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 783df17ca..9d2c10d64 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -3028,120 +3028,6 @@ def plot_load_duration_curve(self, df, region, building_type, color_map, output_ output_path = os.path.abspath(os.path.join(output_dir, filename)) plt.savefig(output_path, bbox_inches='tight') - - - - # get weighted load profiles - def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_name): - """ - This method retrieves weighted timeseries profiles from s3/athena. - Returns dataframe with weighted kWh columns for baseline and upgrade. 
- """ - - # run crawler - athena_client = BuildStockQuery(workgroup='eulp', - db_name='enduse', - table_name=self.comstock_run_name, - buildstock_type='comstock', - skip_reports=True, - metadata_table_suffix='_md_agg_national_by_state_vu', #TODO: make this more dynamic for geo resolution - #ts_table_suffix=f"_timeseries_vu" - ) - - print(athena_client._tables.keys()) - - # generate applicability for upgrade - # if there are multiple upgrades, applicability is based on first upgrade only - builder = ComStockQueryBuilder(self.comstock_run_name) - applicability_query = builder.get_applicability_query( - upgrade_ids=upgrade_num, - state=state, - columns=[ - 'dataset', - '"in.state"', - 'bldg_id', - 'upgrade', - 'applicability' - ] - ) - - #print("Applicability Query:") - #print("="*80) - #print(applicability_query) - #print("="*80) - - # Execute query to get applicable buildings - applic_df = athena_client.execute(applicability_query) - applic_bldgs_list = list(applic_df['bldg_id'].unique()) - applic_bldgs_list = [int(x) for x in applic_bldgs_list] - - import time - from datetime import datetime, time - - # loop through upgrades - for upgrade_id in df[self.UPGRADE_ID].unique(): - - # if there are upgrades, restrict baseline to match upgrade applicability - if (upgrade_id in (0, "0")) and (df[self.UPGRADE_ID].nunique() > 1): - - # Create query builder and generate query - builder = ComStockQueryBuilder(self.comstock_run_name) - query = builder.get_timeseries_aggregation_query( - upgrade_id=upgrade_id, - enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrictions=[ - ('in.state', [f"{state}"]), - ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] - ], - timestamp_grouping='hour', - weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' - ) - - #print("Generated Query:") - #print("="*80) - #print(query) - #print("="*80) - - print(f"Getting weighted baseline load profile for state {state} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings.") - - # temp save figure - df_base_ts_agg_weighted = athena_client.execute(query) - df_base_ts_agg_weighted[self.UPGRADE_NAME] = self.dict_upid_to_upname[0] - df_base_ts_agg_weighted.to_csv('comstock_unscaled_data.csv', index=False) - - else: - # baseline load data when no upgrades are present, or upgrade load data - - - ## Create query builder and generate query for upgrade data - builder = ComStockQueryBuilder(self.comstock_run_name) - query = builder.get_timeseries_aggregation_query( - upgrade_id=upgrade_id, - enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrictions=[ - ('in.state', [f"{state}"]), - ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] - ], - timestamp_grouping='hour', - weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' - ) - - df_up_ts_agg_weighted = athena_client.execute(query) - df_up_ts_agg_weighted[self.UPGRADE_NAME] = self.dict_upid_to_upname[int(upgrade_num[0])] - df_up_ts_agg_weighted.to_csv('comstock_unscaled_upgrade_data.csv', index=False) - - #print("Generated Upgrade Query:") - #print("="*80) - #print(query) - #print("="*80) - - - # Initialize default values in case no data was processed - df_base_ts_agg_weighted = pd.DataFrame() if 'df_base_ts_agg_weighted' not in locals() else df_base_ts_agg_weighted - df_up_ts_agg_weighted = pd.DataFrame() if 'df_up_ts_agg_weighted' not in locals() else df_up_ts_agg_weighted 
- - return df_base_ts_agg_weighted, df_up_ts_agg_weighted - # plot order_list = [ 'interior_equipment', @@ -3167,7 +3053,7 @@ def map_to_season(month): else: return 'Winter' - def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, states, color_map, comstock_run_name): #, df, region, building_type, color_map, output_dir + def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, states, color_map, comstock_run_name, comstock_obj=None): #, df, region, building_type, color_map, output_dir # get upgrade ID df_data = df.copy() @@ -3179,7 +3065,7 @@ def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, states, col # apply queries and weighting for state, state_name in states.items(): - dfs_base_combined, dfs_upgrade_combined = self.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) + dfs_base_combined, dfs_upgrade_combined = comstock_obj.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) # merge into single dataframe dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) @@ -3214,7 +3100,7 @@ def map_to_season(month): seasons = ['Spring', 'Summer', 'Fall', 'Winter'] for season in seasons: peak_week = dfs_merged.loc[dfs_merged['Season']==season, ["total_site_electricity_kwh", "Week_of_Year"]] - peak_week = peak_week.loc[peak_week["total_site_electricity_kwh"] == peak_week["total_site_electricity_kwh"].max(), "Week_of_Year"][0] + peak_week = peak_week.loc[peak_week["total_site_electricity_kwh"] == peak_week["total_site_electricity_kwh"].max(), "Week_of_Year"].iloc[0] # filter to the week @@ -3334,7 +3220,7 @@ def map_to_season(month): print(f"{fig_sub_dir}/timeseries_data_{state_name})") dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{state_name}.csv") - def plot_measure_timeseries_season_average_by_state(self, df, output_dir, states, color_map, comstock_run_name): + def plot_measure_timeseries_season_average_by_state(self, df, output_dir, states, color_map, comstock_run_name, comstock_obj=None): # get upgrade ID df_data = df.copy() @@ -3370,7 +3256,7 @@ def map_to_dow(dow): file_path = os.path.join(fig_sub_dir, f"timeseries_data_{state_name}.csv") dfs_merged=None if not os.path.exists(file_path): - dfs_base_combined, dfs_upgrade_combined = self.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) + dfs_base_combined, dfs_upgrade_combined = comstock_obj.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) # merge into single dataframe dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) @@ -3542,7 +3428,7 @@ def map_to_dow(dow): fig.write_image(fig_path, scale=6) fig.write_html(fig_path_html) - def plot_measure_timeseries_annual_average_by_state_and_enduse(self, df, output_dir, states, color_map, comstock_run_name): + def plot_measure_timeseries_annual_average_by_state_and_enduse(self, df, output_dir, states, color_map, comstock_run_name, comstock_obj=None): # get upgrade ID df_data = df.copy() @@ -3582,7 +3468,7 @@ def map_to_dow(dow): if not os.path.exists(file_path): ######### This is where we get the weighted data - dfs_base_combined, dfs_upgrade_combined = self.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) + dfs_base_combined, dfs_upgrade_combined = comstock_obj.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) ######### # merge into single dataframe From 50b35fd04d7ceb834e48e47720f22fe1f9bfc67f Mon Sep 17 00:00:00 2001 From: 
Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:03:31 -0700 Subject: [PATCH 06/16] works with both states and counties but not yet multiple at one time --- .../compare_upgrades-test_timeseries_plots.py | 119 ++++++++------- postprocessing/comstockpostproc/comstock.py | 138 ++++++++++++++---- .../comstock_measure_comparison.py | 14 +- .../comstock_query_builder.py | 22 ++- .../comstockpostproc/plotting_mixin.py | 32 ++-- 5 files changed, 212 insertions(+), 113 deletions(-) diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index 26678a985..2d77ed435 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -12,15 +12,15 @@ import re -import sys, inspect, importlib, site, platform, shutil, subprocess, os +#import sys, inspect, importlib, site, platform, shutil, subprocess, os -mods = ["pyathena","s3fs","fsspec","botocore","aiobotocore","boto3","pandas"] -for m in mods: - try: - mod = importlib.import_module(m) - print(f"{m:12} {getattr(mod,'__version__','?'):>10} @ {inspect.getfile(mod)}") - except Exception as e: - print(f"{m:12} NOT IMPORTABLE: {e}") +#mods = ["pyathena","s3fs","fsspec","botocore","aiobotocore","boto3","pandas"] +#for m in mods: +# try: +# mod = importlib.import_module(m) +# print(f"{m:12} {getattr(mod,'__version__','?'):>10} @ {inspect.getfile(mod)}") +# except Exception as e: +# print(f"{m:12} NOT IMPORTABLE: {e}") logging.basicConfig(level='INFO') # Use DEBUG, INFO, or WARNING @@ -44,14 +44,19 @@ def main(): include_upgrades=True, # False if not looking at upgrades upgrade_ids_to_skip=[2,3], # Use [1, 3] etc. to exclude certain upgrades make_timeseries_plots=True, - states={ - 'MN': 'Minnesota', # specify state to use for timeseries plots in dictionary format. State ID must correspond correctly. - 'MA':'Massachusetts', - 'OR': 'Oregon', - 'LA': 'Louisiana', - 'AZ': 'Arizona', - 'TN': 'Tennessee' - }, + timeseries_locations_to_plot={ + # 'MN': 'Minnesota', # specify location (either county ID or state ID) and corresponding name for plots and folders. 
+ #'MA':'Massachusetts', + ['MA', 'NH', 'CT', 'VT', 'RI']: 'New England', # example of multiple states together + #'OR': 'Oregon', + #'LA': 'Louisiana', + #'AZ': 'Arizona', + #'TN': 'Tennessee', + #'G2500250': 'Boston', # if specifying a county, you must export county level data to S3 + #'G4900350': 'Salt Lake City', + #'G4804530': 'Austin', + ['G2500250', 'G4804530']:'Baustin' + }, upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' @@ -88,36 +93,44 @@ def main(): # up_alloc_wts = comstock.get_allocated_weights_scaled_to_cbecs_for_upgrade(upgrade_id) comstock.create_allocated_weights_plus_util_bills_for_upgrade(upgrade_id) - # Export metadata files - geo_exports = [ - #{'geo_top_dir': 'by_state_and_county', - # 'partition_cols': { - # comstock.STATE_ABBRV: 'state', - # comstock.COUNTY_ID: 'county', - # }, - # 'aggregation_levels': ['in.nhgis_tract_gisjoin'], # , comstock.COUNTY_ID], # Full tract resolution (agg=in.nhgis_tract_gisjoin) - # 'data_types': ['full', 'basic'], - # 'file_types': ['csv', 'parquet'], - #}, - {'geo_top_dir': 'national_by_state', - 'partition_cols': {}, - 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], - 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. - 'file_types': ['parquet'], # other options:'parquet' - } - ] - - for geo_export in geo_exports: - for upgrade_id in comstock.upgrade_ids_to_process: - # if upgrade_id == 0: - # continue - # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) - - # Also write to S3 if making timeseries plots - if comstock.make_timeseries_plots: - s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" - s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) - comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) + # Specify geo exports + + # county resolution, files by state and county + county_resolution = { + 'geo_top_dir': 'by_state_and_county', + 'partition_cols': { + comstock.STATE_ABBRV: 'state', + comstock.COUNTY_ID: 'county', + }, + 'aggregation_levels': [comstock.COUNTY_ID], # , comstock.COUNTY_ID], # Full tract resolution (agg=in.nhgis_tract_gisjoin) + 'data_types': ['full'], + 'file_types': ['parquet'], + } + + # state level resolution, one single national file + state_resolution = { + 'geo_top_dir': 'national_by_state', + 'partition_cols': {}, + 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], + 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. + 'file_types': ['parquet'], # other options:'parquet' + } + + # specify the export level + # IMPORTANT: if making county level timeseries plots, must export county level data to S3. This does not occur automatically. 
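The mixed key styles in timeseries_locations_to_plot above (two-letter state codes, 'G…' county GISJOINs, and groups of several of either) follow the ID-format heuristic added to comstockpostproc/comstock.py later in this patch series. A minimal standalone sketch of that convention, assuming tuple keys as adopted in the follow-up patch; the helper name and example values here are illustrative only, not part of the patch:

    # Illustrative sketch -- mirrors determine_state_or_county_timeseries_table()
    # added in comstockpostproc/comstock.py; not part of the patch itself.
    def classify_location(key):
        """Return 'state' or 'county' for a single location ID or a tuple of IDs."""
        ids = [key] if isinstance(key, str) else list(key)
        # County GISJOINs look like 'G2500250'; two-letter alpha codes are states.
        # If any member of a grouped key is a county, the whole group is county-level.
        return 'county' if any(i.startswith('G') and len(i) > 2 for i in ids) else 'state'

    example_locations = {
        'MN': 'Minnesota',                               # single state
        ('MA', 'NH', 'CT', 'VT', 'RI'): 'New England',   # several states plotted together
        'G2500250': 'Boston',                            # single county (needs county-level S3 export)
        ('G2500250', 'G4804530'): 'Baustin',             # several counties plotted together
    }
    for key, name in example_locations.items():
        print(f"{name}: {classify_location(key)}")

Grouped keys need to be tuples rather than lists so they remain hashable as dictionary keys.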
+ geo_exports = [county_resolution] #state_resolution + + #for geo_export in geo_exports: + # for upgrade_id in comstock.upgrade_ids_to_process: + # if upgrade_id == 0: + # continue + # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) + + # # Also write to S3 if making timeseries plots + # if comstock.make_timeseries_plots: TODO: force geo exports to county data if couunty timeseries is requested. + # s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + # s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) + # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) # write select results to S3 for Athena/Glue when needed for timeseries plots @@ -132,15 +145,15 @@ def main(): # Export parquet files to S3 for Athena/Glue # TODO: modify so that county data can be used for timeseries plots # TODO: Modify geo export structure to specify parquet files only for this part of the workflow - comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", - dataset_name=crawler_name, - database_name=database, - glue_service_role=glue_service_role) - comstock.fix_timeseries_tables(crawler_name, database) - comstock.create_views(crawler_name, database, workgroup) + #comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", + # dataset_name=crawler_name, + # database_name=database, + # glue_service_role=glue_service_role) + #comstock.fix_timeseries_tables(crawler_name, database) + #comstock.create_views(crawler_name, database, workgroup) # Create measure run comparisons; only use if run has measures - comparison = cspp.ComStockMeasureComparison(comstock, states=comstock.states, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) + comparison = cspp.ComStockMeasureComparison(comstock, timeseries_locations_to_plot=comstock.timeseries_locations_to_plot, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) # Export dictionaries corresponding to the exported columns #comstock.export_data_and_enumeration_dictionary() diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 7f35a2912..df8dc9ddc 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -49,7 +49,7 @@ class ComStock(NamingMixin, UnitsMixin, GasCorrectionModelMixin, S3UtilitiesMixi def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstock_year, athena_table_name, truth_data_version, buildstock_csv_name = 'buildstock.csv', acceptable_failure_percentage=0.01, drop_failed_runs=True, color_hex=NamingMixin.COLOR_COMSTOCK_BEFORE, weighted_energy_units='tbtu', weighted_demand_units='gw', weighted_ghg_units='co2e_mmt', weighted_utility_units='billion_usd', skip_missing_columns=False, - reload_from_cache=False, make_comparison_plots=True, make_timeseries_plots=True, include_upgrades=True, upgrade_ids_to_skip=[], states={}, upgrade_ids_for_comparison={}, rename_upgrades=False, output_dir=None, aws_profile_name=None): + reload_from_cache=False, make_comparison_plots=True, make_timeseries_plots=True, include_upgrades=True, upgrade_ids_to_skip=[], timeseries_locations_to_plot={}, upgrade_ids_for_comparison={}, rename_upgrades=False, output_dir=None, 
aws_profile_name=None): """ A class to load and transform ComStock data for export, analysis, and comparison. Args: @@ -102,7 +102,7 @@ def __init__(self, s3_base_dir, comstock_run_name, comstock_run_version, comstoc self.upgrade_ids_to_skip = upgrade_ids_to_skip self.upgrade_ids_for_comparison = upgrade_ids_for_comparison self.upgrade_ids_to_process = [] - self.states = states + self.timeseries_locations_to_plot = timeseries_locations_to_plot self.unweighted_weighted_map = {} self.cached_parquet = [] # List of parquet files to reload and export # TODO our current credential setup aren't playing well with this approach but does with the s3 ServiceResource @@ -397,6 +397,8 @@ def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, logger.info(f'Reloading from CSV: {file_path}') self.ami_timeseries_data = pd.read_csv(file_path, low_memory=False, index_col='timestamp', parse_dates=True) else: + + ########## REVISE TO USE NEW TIMESERIES FUNCTIONALITY athena_end_uses = list(map(lambda x: self.END_USES_TIMESERIES_DICT[x], self.END_USES)) athena_end_uses.append('total_site_electricity_kwh') all_timeseries_df = pd.DataFrame() @@ -405,10 +407,12 @@ def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, if os.path.isfile(region_file_path_long) and reload_from_csv and save_individual_regions: logger.info(f"timeseries data in long format for {region['source_name']} already exists at {region_file_path_long}") continue - ts_agg = self.athena_client.agg.aggregate_timeseries(enduses=athena_end_uses, group_by=['build_existing_model.building_type', 'time'], restrict=[('build_existing_model.county_id', region['county_ids'])]) + + ######################## END REVISED SECTION + if ts_agg['time'].dtype == 'Int64': # Convert bigint to timestamp type if necessary ts_agg['time'] = pd.to_datetime(ts_agg['time']/1e9, unit='s') @@ -4549,42 +4553,104 @@ def create_views( # get weighted load profiles - def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_name): + def determine_state_or_county_timeseries_table(self, location_id, location_name=None): + """ + Determine if a single location key-value pair represents a state or county request. + + Args: + location_id (str): The location ID (e.g., 'MN', 'G123456') + location_name (str, optional): The location name (e.g., 'Minnesota', 'Boston') + + Returns: + str: 'state' for state-level locations, 'county' for county-level locations + """ + + # Analyze the location ID format + if len(location_id) == 2 and location_id.isalpha(): + # State ID: 2-letter alphabetic code (e.g., 'MN', 'CA', 'TX') + timeseries_geo_type = 'state' + elif location_id.startswith('G') and len(location_id) > 2: + # County ID: starts with 'G' followed by numbers (e.g., 'G123456') + timeseries_geo_type = 'county' + else: + # Handle other potential formats - default to state + print(f"Warning: Unrecognized location ID format: {location_id}, defaulting to state") + timeseries_geo_type = 'state' + + return timeseries_geo_type + + + + def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgrade_name): """ This method retrieves weighted timeseries profiles from s3/athena. Returns dataframe with weighted kWh columns for baseline and upgrade. 
+ + Args: + location_id: Either a state ID (e.g., 'MN') or county ID (e.g., 'G123456') """ + # Determine table type based on the ENTIRE request set, not just this location + # If ANY location in timeseries_locations_to_plot is a county, use county table for ALL + requires_county_table = False + for loc_id in self.timeseries_locations_to_plot.keys(): + if self.determine_state_or_county_timeseries_table(loc_id) == 'county': + requires_county_table = True + break + + # Set up table configuration based on whether counties are needed anywhere + if requires_county_table: + agg = 'county' + metadata_table_suffix = '_md_agg_by_state_and_county_vu' + weight_view_table = f'{self.comstock_run_name}_md_agg_by_state_and_county_vu' + else: + agg = 'state' + metadata_table_suffix = '_md_agg_national_by_state_vu' + weight_view_table = f'{self.comstock_run_name}_md_agg_national_by_state_vu' + + # Determine the geographic type for this specific location (for query building) + location_geo_type = self.determine_state_or_county_timeseries_table(location_id) + # run crawler + # Determine which metadata table suffix to use based on what's available + # Prefer county-level data if available, otherwise use state-level + # Try county-level first athena_client = BuildStockQuery(workgroup='eulp', - db_name='enduse', - table_name=self.comstock_run_name, - buildstock_type='comstock', - skip_reports=True, - metadata_table_suffix='_md_agg_national_by_state_vu', #TODO: make this more dynamic for geo resolution - #ts_table_suffix=f"_timeseries_vu" - ) + db_name='enduse', + table_name=self.comstock_run_name, + buildstock_type='comstock', + skip_reports=True, + metadata_table_suffix=metadata_table_suffix, + ) # Create upgrade ID to name mapping from existing data upgrade_name_mapping = self.data.select(self.UPGRADE_ID, self.UPGRADE_NAME).unique().collect().sort(self.UPGRADE_ID).to_dict(as_series=False) dict_upid_to_upname = dict(zip(upgrade_name_mapping[self.UPGRADE_ID], upgrade_name_mapping[self.UPGRADE_NAME])) - print(athena_client._tables.keys()) - # generate applicability for upgrade # if there are multiple upgrades, applicability is based on first upgrade only builder = ComStockQueryBuilder(self.comstock_run_name) - applicability_query = builder.get_applicability_query( - upgrade_ids=upgrade_num, - state=state, - columns=[ - 'dataset', - '"in.state"', - 'bldg_id', - 'upgrade', - 'applicability' - ] - ) + + # Build query parameters based on what we're actually querying for this location + query_params = { + 'upgrade_ids': upgrade_num, + 'columns': [ + self.DATASET, + f'"{self.STATE_NAME}"' if location_geo_type == 'state' else f'"{self.COUNTY_ID}"', + self.BLDG_ID, + self.UPGRADE_ID, + self.UPGRADE_APPL + ], + 'weight_view_table': weight_view_table + } + + # Add geographic filter based on THIS location's type (not the table type) + if location_geo_type == 'state': + query_params['state'] = location_id # location_id is the state code (e.g., 'MN') + elif location_geo_type == 'county': + query_params['county'] = location_id # location_id is the county code (e.g., 'G123456') + + applicability_query = builder.get_applicability_query(**query_params) #print("Applicability Query:") #print("="*80) @@ -4607,15 +4673,22 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_nam # Create query builder and generate query builder = ComStockQueryBuilder(self.comstock_run_name) + + # Build geographic restrictions based on THIS location's type + if location_geo_type == 'state': + geo_restriction = ('in.state', 
[f"{location_id}"]) + else: # county + geo_restriction = ('in.nhgis_county_gisjoin', [f"{location_id}"]) + query = builder.get_timeseries_aggregation_query( upgrade_id=upgrade_id, enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), restrictions=[ - ('in.state', [f"{state}"]), + geo_restriction, ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] ], timestamp_grouping='hour', - weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' + weight_view_table=weight_view_table ) #print("Generated Query:") @@ -4623,7 +4696,7 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_nam #print(query) #print("="*80) - print(f"Getting weighted baseline load profile for state {state} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings.") + print(f"Getting weighted baseline load profile for {location_geo_type} {location_id} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings (using {agg} table).") # temp save figure df_base_ts_agg_weighted = athena_client.execute(query) @@ -4635,15 +4708,22 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, state, upgrade_nam ## Create query builder and generate query for upgrade data builder = ComStockQueryBuilder(self.comstock_run_name) + + # Build geographic restrictions based on THIS location's type (same as baseline) + if location_geo_type == 'state': + geo_restriction = ('in.state', [f"{location_id}"]) + else: # county + geo_restriction = ('in.nhgis_county_gisjoin', [f"{location_id}"]) + query = builder.get_timeseries_aggregation_query( upgrade_id=upgrade_id, enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), restrictions=[ - ('in.state', [f"{state}"]), + geo_restriction, ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] ], timestamp_grouping='hour', - weight_view_table=f'{self.comstock_run_name}_md_agg_national_by_state_vu' + weight_view_table=weight_view_table ) df_up_ts_agg_weighted = athena_client.execute(query) diff --git a/postprocessing/comstockpostproc/comstock_measure_comparison.py b/postprocessing/comstockpostproc/comstock_measure_comparison.py index 8b3a80be1..a08fe7405 100644 --- a/postprocessing/comstockpostproc/comstock_measure_comparison.py +++ b/postprocessing/comstockpostproc/comstock_measure_comparison.py @@ -16,7 +16,7 @@ class ComStockMeasureComparison(NamingMixin, UnitsMixin, PlottingMixin): - def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_plots, make_timeseries_plots, image_type='jpg', name=None): + def __init__(self, comstock_object: comstock.ComStock, timeseries_locations_to_plot, make_comparison_plots, make_timeseries_plots, image_type='jpg', name=None): # Initialize members assert isinstance(comstock_object.data, pl.LazyFrame) @@ -49,7 +49,7 @@ def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_p self.comstock_run_name = comstock_object.comstock_run_name self.athena_table_name = comstock_object.athena_table_name self.s3_base_dir = comstock_object.s3_base_dir - self.states = states + self.timeseries_locations_to_plot = timeseries_locations_to_plot self.make_timeseries_plots = make_timeseries_plots self.lazyframe_plotter = LazyFramePlotter() @@ -88,7 +88,7 @@ def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_p # make consumption plots for upgrades if requested by user if make_comparison_plots: 
logger.info(f'Making plots for upgrade {upgrade}') - self.make_plots(df_upgrade, self.column_for_grouping, states, make_timeseries_plots, color_map, self.dict_measure_dir[upgrade], self.s3_base_dir, self.athena_table_name, self.comstock_run_name) + self.make_plots(df_upgrade, self.column_for_grouping, timeseries_locations_to_plot, make_timeseries_plots, color_map, self.dict_measure_dir[upgrade], self.s3_base_dir, self.athena_table_name, self.comstock_run_name) else: logger.info("make_comparison_plots is set to false, so not plots were created. Set make_comparison_plots to True for plots.") @@ -120,13 +120,13 @@ def __init__(self, comstock_object: comstock.ComStock, states, make_comparison_p # make consumption plots for upgrades if requested by user if make_comparison_plots: - self.make_comparative_plots(df_upgrade, self.column_for_grouping, states, make_timeseries_plots, color_map, comp_output_dir) + self.make_comparative_plots(df_upgrade, self.column_for_grouping, timeseries_locations_to_plot, make_timeseries_plots, color_map, comp_output_dir) else: logger.info("make_comparison_plots is set to false, so not plots were created. Set make_comparison_plots to True for plots.") end_time = pd.Timestamp.now() logger.info(f"Time taken to make all plots is {end_time - start_time}") - def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make_timeseries_plots, color_map, output_dir, s3_base_dir, athena_table_name, comstock_run_name): + def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, timeseries_locations_to_plot, make_timeseries_plots, color_map, output_dir, s3_base_dir, athena_table_name, comstock_run_name): time_start = pd.Timestamp.now() BASIC_PARAMS = { @@ -194,7 +194,7 @@ def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.UNMET_HOURS_COLS))(**BASIC_PARAMS) if make_timeseries_plots: - TIMESERIES_PARAMS = {'comstock_run_name': self.comstock_run_name, 'states': states, 'color_map': color_map, + TIMESERIES_PARAMS = {'comstock_run_name': self.comstock_run_name, 'timeseries_locations_to_plot': timeseries_locations_to_plot, 'color_map': color_map, 'output_dir': output_dir, 'comstock_obj': self.comstock_object} LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_peak_week_by_state, lazy_frame=lazy_frame.clone(), @@ -206,7 +206,7 @@ def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make time_end = pd.Timestamp.now() logger.info(f"Time taken to make plots is {time_end - time_start}") - def make_comparative_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, states, make_timeseries_plots, color_map, output_dir, s3_base_dir, athena_table_name, comstock_run_name): + def make_comparative_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, timeseries_locations_to_plot, make_timeseries_plots, color_map, output_dir, s3_base_dir, athena_table_name, comstock_run_name): # Make plots comparing the upgrades assert isinstance(lazy_frame, pl.LazyFrame) diff --git a/postprocessing/comstockpostproc/comstock_query_builder.py b/postprocessing/comstockpostproc/comstock_query_builder.py index d1be591cc..db3e1ab26 100644 --- a/postprocessing/comstockpostproc/comstock_query_builder.py +++ b/postprocessing/comstockpostproc/comstock_query_builder.py @@ -332,6 +332,7 @@ def get_building_characteristics_query(self, def get_applicability_query(self, upgrade_ids: List[Union[int, str]], state: Optional[str] = None, + county: 
Optional[str] = None, columns: Optional[List[str]] = None, weight_view_table: Optional[str] = None) -> str: """ @@ -340,23 +341,26 @@ def get_applicability_query(self, Args: upgrade_ids: List of upgrade IDs to filter on state: State abbreviation to filter on (optional) + county: County GISJOIN to filter on (optional) columns: Specific columns to select (optional, defaults to common applicability columns) weight_view_table: Name of the weight view table (optional, uses default naming) Returns: SQL query string """ - + # If no weight view table provided, construct default name if weight_view_table is None: weight_view_table = f"{self.athena_table_name}_md_agg_national_by_state_vu" # Default columns for applicability queries if columns is None: + # Choose geographic column based on which filter is being used + geo_column = '"in.nhgis_county_gisjoin"' if county else '"in.state"' columns = [ 'dataset', - '"in.state"', - 'bldg_id', + geo_column, + 'bldg_id', 'upgrade', 'applicability' ] @@ -366,25 +370,27 @@ def get_applicability_query(self, # Build WHERE clause where_conditions = [] - + # Filter by upgrade IDs if len(upgrade_ids) == 1: where_conditions.append(f'upgrade = {upgrade_ids[0]}') else: upgrade_list = ','.join(map(str, upgrade_ids)) where_conditions.append(f'upgrade IN ({upgrade_list})') - + # Filter by applicability where_conditions.append('applicability = true') - - # Filter by state if provided + + # Filter by geographic location (state or county) if state: where_conditions.append(f'"in.state" = \'{state}\'') + elif county: + where_conditions.append(f'"in.nhgis_county_gisjoin" = \'{county}\'') where_clause = ' AND '.join(where_conditions) query = f""" - SELECT DISTINCT + SELECT DISTINCT {select_clause} FROM {weight_view_table} WHERE {where_clause} diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 9d2c10d64..c02b04eee 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -3053,7 +3053,7 @@ def map_to_season(month): else: return 'Winter' - def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, states, color_map, comstock_run_name, comstock_obj=None): #, df, region, building_type, color_map, output_dir + def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, timeseries_locations_to_plot, color_map, comstock_run_name, comstock_obj=None): #, df, region, building_type, color_map, output_dir # get upgrade ID df_data = df.copy() @@ -3064,8 +3064,8 @@ def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, states, col upgrade_name = list(df_upgrade[self.UPGRADE_NAME].unique()) # apply queries and weighting - for state, state_name in states.items(): - dfs_base_combined, dfs_upgrade_combined = comstock_obj.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) + for location, location_name in timeseries_locations_to_plot.items(): + dfs_base_combined, dfs_upgrade_combined = comstock_obj.get_weighted_load_profiles_from_s3(df_data, upgrade_num, location, upgrade_name) # merge into single dataframe dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) @@ -3194,7 +3194,7 @@ def map_to_season(month): y=-0.35, # Adjust this value as needed to place the title correctly xref='paper', yref='paper', - text=f"{season} Peak Week, Applicable Buildings - {state_name}", + text=f"{season} Peak Week, Applicable Buildings - {location_name}", showarrow=False, font=dict( size=16 @@ 
-3208,7 +3208,7 @@ def map_to_season(month): title = f"{season}_peak_week" fig_name = f'{title.replace(" ", "_").lower()}.{self.image_type}' fig_name_html = f'{title.replace(" ", "_").lower()}.html' - fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{state_name}")) + fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{location_name}")) if not os.path.exists(fig_sub_dir): os.makedirs(fig_sub_dir) fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) @@ -3217,10 +3217,10 @@ def map_to_season(month): fig.write_image(fig_path, scale=6) fig.write_html(fig_path_html) - print(f"{fig_sub_dir}/timeseries_data_{state_name})") - dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{state_name}.csv") + print(f"{fig_sub_dir}/timeseries_data_{location_name})") + dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{location_name}.csv") - def plot_measure_timeseries_season_average_by_state(self, df, output_dir, states, color_map, comstock_run_name, comstock_obj=None): + def plot_measure_timeseries_season_average_by_state(self, df, output_dir, timeseries_locations_to_plot, color_map, comstock_run_name, comstock_obj=None): # get upgrade ID df_data = df.copy() @@ -3248,15 +3248,15 @@ def map_to_dow(dow): return 'Weekend' # apply queries and weighting - for state, state_name in states.items(): + for location, location_name in timeseries_locations_to_plot.items(): # check to see if timeseries file exists. # if it does, reload. Else, query data. - fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{state_name}")) - file_path = os.path.join(fig_sub_dir, f"timeseries_data_{state_name}.csv") + fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{location_name}")) + file_path = os.path.join(fig_sub_dir, f"timeseries_data_{location_name}.csv") dfs_merged=None if not os.path.exists(file_path): - dfs_base_combined, dfs_upgrade_combined = comstock_obj.get_weighted_load_profiles_from_s3(df_data, upgrade_num, state, upgrade_name) + dfs_base_combined, dfs_upgrade_combined = comstock_obj.get_weighted_load_profiles_from_s3(df_data, upgrade_num, location, upgrade_name) # merge into single dataframe dfs_merged = pd.concat([dfs_base_combined, dfs_upgrade_combined], ignore_index=True) @@ -3398,7 +3398,7 @@ def map_to_dow(dow): # Update layout fig.update_layout( - title=f"Seasonal Average, Applicable Buildings - {state_name}", + title=f"Seasonal Average, Applicable Buildings - {location_name}", title_x=0.04, # Align title to the left title_y=0.97, # Move title to the bottom title_xanchor='left', @@ -3419,7 +3419,7 @@ def map_to_dow(dow): title = "seasonal_average_subplot" fig_name = f'{title.replace(" ", "_").lower()}.{self.image_type}' fig_name_html = f'{title.replace(" ", "_").lower()}.html' - fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{state_name}")) + fig_sub_dir = os.path.abspath(os.path.join(output_dir, f"timeseries/{location_name}")) if not os.path.exists(fig_sub_dir): os.makedirs(fig_sub_dir) fig_path = os.path.abspath(os.path.join(fig_sub_dir, fig_name)) @@ -3428,7 +3428,7 @@ def map_to_dow(dow): fig.write_image(fig_path, scale=6) fig.write_html(fig_path_html) - def plot_measure_timeseries_annual_average_by_state_and_enduse(self, df, output_dir, states, color_map, comstock_run_name, comstock_obj=None): + def plot_measure_timeseries_annual_average_by_state_and_enduse(self, df, output_dir, timeseries_locations_to_plot, color_map, comstock_run_name, comstock_obj=None): # get upgrade ID df_data = df.copy() @@ -3456,7 +3456,7 
@@ def map_to_dow(dow): return 'Weekend' # apply queries and weighting - for state, state_name in states.items(): + for state, state_name in timeseries_locations_to_plot.items(): # check to see if timeseries data already exists # check to see if timeseries file exists. From d52934597bee49b7f064af3f92500b91a4188dd5 Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:23:26 -0700 Subject: [PATCH 07/16] refactor code to accommodate multiple state/county queries together for timeseries plots --- .../compare_upgrades-test_timeseries_plots.py | 6 +- postprocessing/comstockpostproc/comstock.py | 156 ++++++++++-------- .../comstock_query_builder.py | 28 +++- 3 files changed, 109 insertions(+), 81 deletions(-) diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index 2d77ed435..b8ac84801 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -45,9 +45,9 @@ def main(): upgrade_ids_to_skip=[2,3], # Use [1, 3] etc. to exclude certain upgrades make_timeseries_plots=True, timeseries_locations_to_plot={ - # 'MN': 'Minnesota', # specify location (either county ID or state ID) and corresponding name for plots and folders. + 'MN': 'Minnesota', # specify location (either county ID or state ID) and corresponding name for plots and folders. #'MA':'Massachusetts', - ['MA', 'NH', 'CT', 'VT', 'RI']: 'New England', # example of multiple states together + ('MA', 'NH', 'CT', 'VT', 'RI'): 'New England', # example of multiple states together - using tuples as keys #'OR': 'Oregon', #'LA': 'Louisiana', #'AZ': 'Arizona', @@ -55,7 +55,7 @@ def main(): #'G2500250': 'Boston', # if specifying a county, you must export county level data to S3 #'G4900350': 'Salt Lake City', #'G4804530': 'Austin', - ['G2500250', 'G4804530']:'Baustin' + ('G2500250', 'G4804530'):'Baustin' # multiple counties together - using tuples as keys }, upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index df8dc9ddc..2fa54d2c5 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -4553,48 +4553,67 @@ def create_views( # get weighted load profiles - def determine_state_or_county_timeseries_table(self, location_id, location_name=None): + def determine_state_or_county_timeseries_table(self, location_input, location_name=None): """ - Determine if a single location key-value pair represents a state or county request. + Determine if location input represents state or county request(s). 
Args: - location_id (str): The location ID (e.g., 'MN', 'G123456') + location_input: Either a single location ID string or tuple/list of location IDs location_name (str, optional): The location name (e.g., 'Minnesota', 'Boston') Returns: str: 'state' for state-level locations, 'county' for county-level locations """ - # Analyze the location ID format - if len(location_id) == 2 and location_id.isalpha(): - # State ID: 2-letter alphabetic code (e.g., 'MN', 'CA', 'TX') - timeseries_geo_type = 'state' - elif location_id.startswith('G') and len(location_id) > 2: - # County ID: starts with 'G' followed by numbers (e.g., 'G123456') - timeseries_geo_type = 'county' - else: - # Handle other potential formats - default to state - print(f"Warning: Unrecognized location ID format: {location_id}, defaulting to state") - timeseries_geo_type = 'state' + # Handle single location (string) + if isinstance(location_input, str): + if len(location_input) == 2 and location_input.isalpha(): + # State ID: 2-letter alphabetic code (e.g., 'MN', 'CA', 'TX') + return 'state' + elif location_input.startswith('G') and len(location_input) > 2: + # County ID: starts with 'G' followed by numbers (e.g., 'G123456') + return 'county' + else: + # Handle other potential formats - default to state + print(f"Warning: Unrecognized location ID format: {location_input}, defaulting to state") + return 'state' + + # Handle multiple locations (tuple or list) + elif isinstance(location_input, (tuple, list)): + # Check all locations and determine if any counties are present + for location_id in location_input: + if self.determine_state_or_county_timeseries_table(location_id) == 'county': + return 'county' # Use county table if any counties present + return 'state' # All are states - return timeseries_geo_type + else: + print(f"Warning: Unexpected location input type: {type(location_input)}, defaulting to state") + return 'state' - def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgrade_name): + def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, upgrade_name): """ This method retrieves weighted timeseries profiles from s3/athena. Returns dataframe with weighted kWh columns for baseline and upgrade. 
Args: - location_id: Either a state ID (e.g., 'MN') or county ID (e.g., 'G123456') + location_input: Either a single location ID (e.g., 'MN') or tuple/list of location IDs (e.g., ('MA', 'NH')) """ + # Normalize location_input to a list for consistent processing + if isinstance(location_input, str): + location_list = [location_input] + elif isinstance(location_input, (tuple, list)): + location_list = list(location_input) + else: + raise ValueError(f"Unexpected location_input type: {type(location_input)}") + # Determine table type based on the ENTIRE request set, not just this location # If ANY location in timeseries_locations_to_plot is a county, use county table for ALL requires_county_table = False - for loc_id in self.timeseries_locations_to_plot.keys(): - if self.determine_state_or_county_timeseries_table(loc_id) == 'county': + for loc_key in self.timeseries_locations_to_plot.keys(): + if self.determine_state_or_county_timeseries_table(loc_key) == 'county': requires_county_table = True break @@ -4608,13 +4627,7 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgra metadata_table_suffix = '_md_agg_national_by_state_vu' weight_view_table = f'{self.comstock_run_name}_md_agg_national_by_state_vu' - # Determine the geographic type for this specific location (for query building) - location_geo_type = self.determine_state_or_county_timeseries_table(location_id) - - # run crawler - # Determine which metadata table suffix to use based on what's available - # Prefer county-level data if available, otherwise use state-level - # Try county-level first + # Initialize Athena client athena_client = BuildStockQuery(workgroup='eulp', db_name='enduse', table_name=self.comstock_run_name, @@ -4627,16 +4640,28 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgra upgrade_name_mapping = self.data.select(self.UPGRADE_ID, self.UPGRADE_NAME).unique().collect().sort(self.UPGRADE_ID).to_dict(as_series=False) dict_upid_to_upname = dict(zip(upgrade_name_mapping[self.UPGRADE_ID], upgrade_name_mapping[self.UPGRADE_NAME])) - # generate applicability for upgrade - # if there are multiple upgrades, applicability is based on first upgrade only + # Determine geographic types for all locations + state_locations = [] + county_locations = [] + + for location_id in location_list: + location_geo_type = self.determine_state_or_county_timeseries_table(location_id) + if location_geo_type == 'state': + state_locations.append(location_id) + else: + county_locations.append(location_id) + + # Build applicability query for all locations builder = ComStockQueryBuilder(self.comstock_run_name) - # Build query parameters based on what we're actually querying for this location + # Determine column type based on primary geographic type + primary_geo_type = self.determine_state_or_county_timeseries_table(location_input) + query_params = { 'upgrade_ids': upgrade_num, 'columns': [ self.DATASET, - f'"{self.STATE_NAME}"' if location_geo_type == 'state' else f'"{self.COUNTY_ID}"', + f'"{self.STATE_NAME}"' if primary_geo_type == 'state' else f'"{self.COUNTY_ID}"', self.BLDG_ID, self.UPGRADE_ID, self.UPGRADE_APPL @@ -4644,19 +4669,21 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgra 'weight_view_table': weight_view_table } - # Add geographic filter based on THIS location's type (not the table type) - if location_geo_type == 'state': - query_params['state'] = location_id # location_id is the state code (e.g., 'MN') - elif location_geo_type == 'county': - 
query_params['county'] = location_id # location_id is the county code (e.g., 'G123456') + # Add geographic filters for all locations + if state_locations and county_locations: + # Mixed - need to query both separately and combine + # For now, use the primary type's approach + if primary_geo_type == 'state': + query_params['state'] = state_locations if len(state_locations) > 1 else state_locations[0] + else: + query_params['county'] = county_locations if len(county_locations) > 1 else county_locations[0] + elif state_locations: + query_params['state'] = state_locations if len(state_locations) > 1 else state_locations[0] + elif county_locations: + query_params['county'] = county_locations if len(county_locations) > 1 else county_locations[0] applicability_query = builder.get_applicability_query(**query_params) - #print("Applicability Query:") - #print("="*80) - #print(applicability_query) - #print("="*80) - # Execute query to get applicable buildings applic_df = athena_client.execute(applicability_query) applic_bldgs_list = list(applic_df['bldg_id'].unique()) @@ -4665,6 +4692,13 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgra import time from datetime import datetime, time + # Build geographic restrictions for all locations + geo_restrictions = [] + if state_locations: + geo_restrictions.append(('in.state', state_locations)) + if county_locations: + geo_restrictions.append(('in.nhgis_county_gisjoin', county_locations)) + # loop through upgrades for upgrade_id in df[self.UPGRADE_ID].unique(): @@ -4674,54 +4708,39 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgra # Create query builder and generate query builder = ComStockQueryBuilder(self.comstock_run_name) - # Build geographic restrictions based on THIS location's type - if location_geo_type == 'state': - geo_restriction = ('in.state', [f"{location_id}"]) - else: # county - geo_restriction = ('in.nhgis_county_gisjoin', [f"{location_id}"]) + # Build restrictions list including all geographic locations + restrictions = [('bldg_id', applic_bldgs_list)] + restrictions.extend(geo_restrictions) query = builder.get_timeseries_aggregation_query( upgrade_id=upgrade_id, enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrictions=[ - geo_restriction, - ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] - ], + restrictions=restrictions, timestamp_grouping='hour', weight_view_table=weight_view_table ) - #print("Generated Query:") - #print("="*80) - #print(query) - #print("="*80) - - print(f"Getting weighted baseline load profile for {location_geo_type} {location_id} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings (using {agg} table).") + location_desc = f"locations {location_list}" if len(location_list) > 1 else f"{primary_geo_type} {location_list[0]}" + print(f"Getting weighted baseline load profile for {location_desc} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings (using {agg} table).") - # temp save figure + # Execute query df_base_ts_agg_weighted = athena_client.execute(query) df_base_ts_agg_weighted[self.UPGRADE_NAME] = dict_upid_to_upname[0] else: # baseline load data when no upgrades are present, or upgrade load data - ## Create query builder and generate query for upgrade data builder = ComStockQueryBuilder(self.comstock_run_name) - # Build geographic restrictions based on THIS location's type (same as baseline) - if location_geo_type == 
'state': - geo_restriction = ('in.state', [f"{location_id}"]) - else: # county - geo_restriction = ('in.nhgis_county_gisjoin', [f"{location_id}"]) + # Build restrictions list including all geographic locations (same as baseline) + restrictions = [('bldg_id', applic_bldgs_list)] + restrictions.extend(geo_restrictions) query = builder.get_timeseries_aggregation_query( upgrade_id=upgrade_id, enduses=(list(self.END_USES_TIMESERIES_DICT.values())+["total_site_electricity_kwh"]), - restrictions=[ - geo_restriction, - ('bldg_id', applic_bldgs_list) # match applicability of upgrade applic_bldgs_list[0] - ], + restrictions=restrictions, timestamp_grouping='hour', weight_view_table=weight_view_table ) @@ -4729,11 +4748,6 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_id, upgra df_up_ts_agg_weighted = athena_client.execute(query) df_up_ts_agg_weighted[self.UPGRADE_NAME] = dict_upid_to_upname[int(upgrade_num[0])] - #print("Generated Upgrade Query:") - #print("="*80) - #print(query) - #print("="*80) - # Initialize default values in case no data was processed df_base_ts_agg_weighted = pd.DataFrame() if 'df_base_ts_agg_weighted' not in locals() else df_base_ts_agg_weighted diff --git a/postprocessing/comstockpostproc/comstock_query_builder.py b/postprocessing/comstockpostproc/comstock_query_builder.py index db3e1ab26..1f940c79f 100644 --- a/postprocessing/comstockpostproc/comstock_query_builder.py +++ b/postprocessing/comstockpostproc/comstock_query_builder.py @@ -331,8 +331,8 @@ def get_building_characteristics_query(self, def get_applicability_query(self, upgrade_ids: List[Union[int, str]], - state: Optional[str] = None, - county: Optional[str] = None, + state: Optional[Union[str, List[str]]] = None, + county: Optional[Union[str, List[str]]] = None, columns: Optional[List[str]] = None, weight_view_table: Optional[str] = None) -> str: """ @@ -340,8 +340,8 @@ def get_applicability_query(self, Args: upgrade_ids: List of upgrade IDs to filter on - state: State abbreviation to filter on (optional) - county: County GISJOIN to filter on (optional) + state: State abbreviation(s) to filter on (optional, can be single string or list) + county: County GISJOIN(s) to filter on (optional, can be single string or list) columns: Specific columns to select (optional, defaults to common applicability columns) weight_view_table: Name of the weight view table (optional, uses default naming) @@ -381,11 +381,25 @@ def get_applicability_query(self, # Filter by applicability where_conditions.append('applicability = true') - # Filter by geographic location (state or county) + # Filter by geographic location (state or county) - support single values or lists if state: - where_conditions.append(f'"in.state" = \'{state}\'') + if isinstance(state, str): + where_conditions.append(f'"in.state" = \'{state}\'') + elif isinstance(state, (list, tuple)): + if len(state) == 1: + where_conditions.append(f'"in.state" = \'{state[0]}\'') + else: + state_list = "', '".join(state) + where_conditions.append(f'"in.state" IN (\'{state_list}\')') elif county: - where_conditions.append(f'"in.nhgis_county_gisjoin" = \'{county}\'') + if isinstance(county, str): + where_conditions.append(f'"in.nhgis_county_gisjoin" = \'{county}\'') + elif isinstance(county, (list, tuple)): + if len(county) == 1: + where_conditions.append(f'"in.nhgis_county_gisjoin" = \'{county[0]}\'') + else: + county_list = "', '".join(county) + where_conditions.append(f'"in.nhgis_county_gisjoin" IN (\'{county_list}\')') where_clause = ' AND 
'.join(where_conditions) From f234281df0e4e1fd61bb0c431b6381802a7598ff Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Fri, 7 Nov 2025 13:53:41 -0700 Subject: [PATCH 08/16] Update compare_upgrades-test_timeseries_plots.py --- postprocessing/compare_upgrades-test_timeseries_plots.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index b8ac84801..346786a17 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -47,16 +47,16 @@ def main(): timeseries_locations_to_plot={ 'MN': 'Minnesota', # specify location (either county ID or state ID) and corresponding name for plots and folders. #'MA':'Massachusetts', - ('MA', 'NH', 'CT', 'VT', 'RI'): 'New England', # example of multiple states together - using tuples as keys #'OR': 'Oregon', #'LA': 'Louisiana', #'AZ': 'Arizona', #'TN': 'Tennessee', - #'G2500250': 'Boston', # if specifying a county, you must export county level data to S3 + ('MA', 'NH', 'CT', 'VT', 'RI'): 'New England', # example of multiple states together - using tuples as keys #'G4900350': 'Salt Lake City', + #'G2500250': 'Boston', # if specifying a county, you must export county level data to S3 #'G4804530': 'Austin', ('G2500250', 'G4804530'):'Baustin' # multiple counties together - using tuples as keys - }, + }, upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' From 6e878d508de11c2128d75f8e238c86fd74168fc0 Mon Sep 17 00:00:00 2001 From: christophercaradonna Date: Fri, 14 Nov 2025 11:41:55 -0700 Subject: [PATCH 09/16] Add AMI plot functionality and some cleanup --- .../compare_comstock_to_ami.py.template | 126 ++++-- postprocessing/compare_upgrades.py.template | 123 +++--- postprocessing/comstockpostproc/comstock.py | 161 +++++--- .../comstock_measure_comparison.py | 121 +++--- .../comstock_query_builder.py | 380 +++++------------- .../comstock_to_ami_comparison.py | 13 +- .../comstockpostproc/plotting_mixin.py | 25 +- 7 files changed, 463 insertions(+), 486 deletions(-) diff --git a/postprocessing/compare_comstock_to_ami.py.template b/postprocessing/compare_comstock_to_ami.py.template index 9e18c1fe2..fa7ce6b3e 100644 --- a/postprocessing/compare_comstock_to_ami.py.template +++ b/postprocessing/compare_comstock_to_ami.py.template @@ -12,37 +12,105 @@ logger = logging.getLogger(__name__) def main(): # ComStock run comstock = cspp.ComStock( - s3_base_dir='eulp/comstock_core', - comstock_run_name='ami_comparison', - comstock_run_version='ami_comparison', - comstock_year=2018, - truth_data_version='v01', - buildstock_csv_name='buildstock.csv', - acceptable_failure_percentage=0.9, - drop_failed_runs=True, - color_hex='#0072B2', - skip_missing_columns=True, - athena_table_name='ami_comparison', - reload_from_csv=False, - include_upgrades=False - ) - - # CBECS + s3_base_dir='com-sdr', # If run not on S3, download results_up**.parquet manually + comstock_run_name='rtuadv_v11', # Name of the run on S3 + comstock_run_version='rtuadv_v11', # Use whatever you want to see in plot and folder names + comstock_year=2018, # Typically don't change this + athena_table_name=None, # Typically don't change this + 
truth_data_version='v01', # Typically don't change this + buildstock_csv_name='buildstock.csv', # Download buildstock.csv manually + acceptable_failure_percentage=0.25, # Can increase this when testing and high failure are OK + drop_failed_runs=True, # False if you want to evaluate which runs failed in raw output data + color_hex='#0072B2', # Color used to represent this run in plots + skip_missing_columns=True, # False if you want to ensure you have all data specified for export + reload_from_cache=True, # True if CSV already made and want faster reload times + include_upgrades=False, # False if not looking at upgrades + # output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' + ) + + + # Stock Estimation for Apportionment: + stock_estimate = cspp.Apportion( + stock_estimation_version='2025R3', # Only updated when a new stock estimate is published + truth_data_version='v01', # Typically don't change this + reload_from_cache=True # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. + ) + + # Scale ComStock runs to the 'truth data' from StockE V3 estimates using bucket-based apportionment + base_sim_outs = comstock.get_sim_outs_for_upgrade(0) + comstock.create_allocated_weights(stock_estimate, base_sim_outs, reload_from_cache=False) + + # CBECS cbecs = cspp.CBECS( - cbecs_year = 2018, - truth_data_version='v01', - color_hex='#009E73', - reload_from_csv=False + cbecs_year=2018, # 2012 and 2018 currently available + truth_data_version='v01', # Typically don't change this + color_hex='#009E73', # Color used to represent CBECS in plots + reload_from_csv=True # True if CSV already made and want faster reload times ) - - # Scale ComStock run to CBECS 2018 AND remove non-ComStock buildings from CBECS + + # First scale ComStock runs to the 'truth data' from StockE V3 estimates using bucket-based apportionment + # Then scale both ComStock runs to CBECS 2018 AND remove non-ComStock buildings from CBECS # This is how weights in the models are set to represent national energy consumption - comstock.add_national_scaling_weights(cbecs, remove_non_comstock_bldg_types_from_cbecs=True) + base_sim_outs = comstock.get_sim_outs_for_upgrade(0) + alloc_wts = comstock.get_allocated_weights() + comstock.create_allocated_weights_scaled_to_cbecs(cbecs, base_sim_outs, alloc_wts, remove_non_comstock_bldg_types_from_cbecs=True) + comstock.create_allocated_weights_plus_util_bills_for_upgrade(0) + + + # county resolution, files by state and county + county_resolution = { + 'geo_top_dir': 'by_state_and_county', + 'partition_cols': { + comstock.STATE_ABBRV: 'state', + comstock.COUNTY_ID: 'county', + }, + 'aggregation_levels': [comstock.COUNTY_ID], # , comstock.COUNTY_ID], # Full tract resolution (agg=in.nhgis_tract_gisjoin) + 'data_types': ['full'], + 'file_types': ['parquet'], + } + + # state level resolution, one single national file + state_resolution = { + 'geo_top_dir': 'national_by_state', + 'partition_cols': {}, + 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], + 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. + 'file_types': ['parquet'], # other options:'parquet' + } + + # specify the export level + # IMPORTANT: if making county level timeseries plots, must export county level data to S3. This does not occur automatically. 
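The export level chosen below also has to line up with the Athena view that get_weighted_load_profiles_from_s3() queries (patches 06-07 above): county-level timeseries plots read from the *_md_agg_by_state_and_county_vu view, while state-level plots use *_md_agg_national_by_state_vu. A rough sketch of that pairing, assuming the view names track the geo_top_dir naming used here; the helper function is illustrative, not an existing API:

    # Illustrative only -- pairs each geo export level with the metadata/weight
    # view suffix expected by the timeseries queries in comstock.py.
    GEO_EXPORT_TO_VIEW_SUFFIX = {
        'by_state_and_county': '_md_agg_by_state_and_county_vu',  # required when any county is plotted
        'national_by_state': '_md_agg_national_by_state_vu',      # sufficient for state-only plots
    }

    def weight_view_table(run_name: str, geo_top_dir: str) -> str:
        # e.g. weight_view_table('rtuadv_v11', 'by_state_and_county')
        #   -> 'rtuadv_v11_md_agg_by_state_and_county_vu'
        return f"{run_name}{GEO_EXPORT_TO_VIEW_SUFFIX[geo_top_dir]}"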
+ geo_exports = [county_resolution] #state_resolution + + #for geo_export in geo_exports: + # # write files locally as needed + # #comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=0, geo_exports=[geo_export]) + + # # Also write to S3 if making timeseries plots + # if comstock.make_timeseries_plots: + # s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + # s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) + # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=0, geo_exports=[geo_export], output_dir=s3_output_dir) + + # write select results to S3 for Athena/Glue when needed for timeseries plots + #s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + #database = "enduse" + #dataset_name = f"{comstock.comstock_run_name}/{comstock.comstock_run_name}" # name of the dir in s3_dir we want to make new tables and views for + #crawler_name = comstock.comstock_run_name # used to set name of crawler, cannot include slashes + #workgroup = "eulp" # Athena workgroup to use + #glue_service_role = "service-role/AWSGlueServiceRole-default" + + # Export parquet files to S3 for Athena/Glue + # TODO: modify so that county data can be used for timeseries plots + # TODO: Modify geo export structure to specify parquet files only for this part of the workflow + #comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", + # dataset_name=crawler_name, + # database_name=database, + # glue_service_role=glue_service_role) + #comstock.fix_timeseries_tables(crawler_name, database) + #comstock.create_views(crawler_name, database, workgroup) + - # Export CBECS and ComStock data to wide and long formats for Tableau and to skip processing later - cbecs.export_to_csv_wide() # May comment this out after run once - comstock.export_to_csv_wide() # May comment this out after run once - # AMI ami = cspp.AMI( truth_data_version='v01', @@ -52,8 +120,8 @@ def main(): # comparison comparison = cspp.ComStockToAMIComparison(comstock, ami, make_comparison_plots=True) - comparison.export_plot_data_to_csv_wide() - + #comparison.export_plot_data_to_csv_wide() + # Code to execute the script if __name__ == "__main__": main() \ No newline at end of file diff --git a/postprocessing/compare_upgrades.py.template b/postprocessing/compare_upgrades.py.template index 36618a804..1580a60c4 100644 --- a/postprocessing/compare_upgrades.py.template +++ b/postprocessing/compare_upgrades.py.template @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- import logging - import comstockpostproc as cspp logging.basicConfig(level='INFO') # Use DEBUG, INFO, or WARNING @@ -11,9 +10,9 @@ logger = logging.getLogger(__name__) def main(): # ComStock run comstock = cspp.ComStock( - s3_base_dir='com-sdr', # If run not on S3, download results_up**.parquet manually - comstock_run_name='sdr_2025_r3_combined', # Name of the run on S3 - comstock_run_version='sdr_2025_r3_combined', # Use whatever you want to see in plot and folder names + s3_base_dir='eulp/euss_com', # If run not on S3, download results_up**.parquet manually + comstock_run_name='cchpc_10k_3', # Name of the run on S3 + comstock_run_version='cchpc_10k_3', # Use whatever you want to see in plot and folder names comstock_year=2018, # Typically don't change this athena_table_name=None, # Typically don't change this truth_data_version='v01', # Typically don't change this @@ -22,18 +21,25 @@ def main(): drop_failed_runs=True, # False 
if you want to evaluate which runs failed in raw output data color_hex='#0072B2', # Color used to represent this run in plots skip_missing_columns=True, # False if you want to ensure you have all data specified for export - reload_from_cache=True, # True if CSV already made and want faster reload times + reload_from_cache=False, # True if CSV already made and want faster reload times include_upgrades=True, # False if not looking at upgrades upgrade_ids_to_skip=[], # Use [1, 3] etc. to exclude certain upgrades - make_timeseries_plots=False, - states={ - #'MN': 'Minnesota', # specify state to use for timeseries plots in dictionary format. State ID must correspond correctly. - 'MA':'Massachusetts', - #'OR': 'Oregon', - #'LA': 'Louisiana', - #'AZ': 'Arizona', - #'TN': 'Tennessee' - }, + make_timeseries_plots=True, + timeseries_locations_to_plot={ + 'MN': 'Minnesota', # specify location (either county ID or state ID) and corresponding name for plots and folders. + #'MA':'Massachusetts', + 'OH':'Ohio', + #'OR': 'Oregon', + #'LA': 'Louisiana', + 'AZ': 'Arizona', + #'TN': 'Tennessee', + #('MA', 'NH', 'CT', 'VT', 'RI'): 'New England', # example of multiple states together - using tuples as keys + #'G4900350': 'Salt Lake City', + #'G2500250': 'Boston', # if specifying a county, you must export county level data to S3 + #'G4804530': 'Austin', + #('G2500250', 'G4804530'):'Baustin' # multiple counties together - using tuples as keys + }, + upgrade_ids_for_comparison={} # Use {'':[0,1,2]}; add as many upgrade IDs as needed, but plots look strange over 5 #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' ) @@ -42,26 +48,26 @@ def main(): stock_estimate = cspp.Apportion( stock_estimation_version='2025R3', # Only updated when a new stock estimate is published truth_data_version='v01', # Typically don't change this - reload_from_cache=True, # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. + reload_from_cache=False, # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. 
#output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' ) # Scale ComStock runs to the 'truth data' from StockE V3 estimates using bucket-based apportionment base_sim_outs = comstock.get_sim_outs_for_upgrade(0) - comstock.create_allocated_weights(stock_estimate, base_sim_outs, reload_from_cache=True) + comstock.create_allocated_weights(stock_estimate, base_sim_outs, reload_from_cache=False) # CBECS cbecs = cspp.CBECS( cbecs_year=2018, # 2012 and 2018 currently available truth_data_version='v01', # Typically don't change this color_hex='#009E73', # Color used to represent CBECS in plots - reload_from_csv=True # True if CSV already made and want faster reload times + reload_from_csv=False # True if CSV already made and want faster reload times ) # Scale ComStock to CBECS 2018 AND remove non-ComStock buildings from CBECS base_sim_outs = comstock.get_sim_outs_for_upgrade(0) alloc_wts = comstock.get_allocated_weights() - comstock.create_allocated_weights_scaled_to_cbecs(cbecs, base_sim_outs, alloc_wts, remove_non_comstock_bldg_types_from_cbecs=False) + comstock.create_allocated_weights_scaled_to_cbecs(cbecs, base_sim_outs, alloc_wts, remove_non_comstock_bldg_types_from_cbecs=True) # Add utility bills onto allocated weights for upgrade_id in comstock.upgrade_ids_to_process: @@ -69,44 +75,67 @@ def main(): # up_alloc_wts = comstock.get_allocated_weights_scaled_to_cbecs_for_upgrade(upgrade_id) comstock.create_allocated_weights_plus_util_bills_for_upgrade(upgrade_id) - # Export metadata files - geo_exports = [ - #{'geo_top_dir': 'by_state_and_county', - # 'partition_cols': { - # comstock.STATE_ABBRV: 'state', - # comstock.COUNTY_ID: 'county', - # }, - # 'aggregation_levels': ['in.nhgis_tract_gisjoin'], # , comstock.COUNTY_ID], # Full tract resolution (agg=in.nhgis_tract_gisjoin) - # 'data_types': ['full', 'basic'], - # 'file_types': ['csv', 'parquet'], - #}, - {'geo_top_dir': 'national_by_state', - 'partition_cols': {}, - 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], - 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. - 'file_types': ['csv'], # other options:'parquet' - } - ] + # Specify geo exports + # county resolution, files by state and county + county_resolution = { + 'geo_top_dir': 'by_state_and_county', + 'partition_cols': { + comstock.STATE_ABBRV: 'state', + comstock.COUNTY_ID: 'county', + }, + 'aggregation_levels': [comstock.COUNTY_ID], # , comstock.COUNTY_ID], # Full tract resolution (agg=in.nhgis_tract_gisjoin) + 'data_types': ['full'], + 'file_types': ['parquet'], + } + + # state level resolution, one single national file + state_resolution = { + 'geo_top_dir': 'national_by_state', + 'partition_cols': {}, + 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], + 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. + 'file_types': ['parquet'], # other options:'parquet' + } + + # specify the export level + # IMPORTANT: if making county level timeseries plots, must export county level data to S3. This does not occur automatically. 
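    # Illustrative sketch of the county-export caveat above. requests_county_timeseries is a
    # hypothetical helper, not part of comstockpostproc; it only assumes the convention shown in
    # timeseries_locations_to_plot, where county IDs are GISJOIN strings starting with 'G',
    # states are two-letter abbreviations, and tuple keys group multiple locations.
    def requests_county_timeseries(locations: dict) -> bool:
        for key in locations:
            ids = key if isinstance(key, tuple) else (key,)
            if any(str(loc).startswith('G') for loc in ids):
                return True
        return False

    # Example usage: warn early rather than discovering missing county data during plotting.
    # if comstock.make_timeseries_plots and requests_county_timeseries(comstock.timeseries_locations_to_plot):
    #     logger.warning('County-level timeseries plots requested: include county_resolution in '
    #                    'geo_exports and export it to S3.')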
+ geo_exports = [state_resolution] #state_resolution for geo_export in geo_exports: for upgrade_id in comstock.upgrade_ids_to_process: - # if upgrade_id == 0: - # continue + # if upgrade_id == 0: + # continue comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) - ## Compare to CBECS - #comp = cspp.ComStockToCBECSComparison(cbecs_list=[cbecs], - # comstock_list=[comstock], - # upgrade_id='All', - # make_comparison_plots=True, - # make_hvac_plots=False) - #comp.export_to_csv_wide() # May comment this out after run once + # Also write to S3 if making timeseries plots + if comstock.make_timeseries_plots: #TODO: force geo exports to county data if couunty timeseries is requested. + s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) + comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) + + + # write select results to S3 for Athena/Glue when needed for timeseries plots + if comstock.make_timeseries_plots: + s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + database = "enduse" + dataset_name = f"{comstock.comstock_run_name}/{comstock.comstock_run_name}" # name of the dir in s3_dir we want to make new tables and views for + crawler_name = comstock.comstock_run_name # used to set name of crawler, cannot include slashes + workgroup = "eulp" # Athena workgroup to use + glue_service_role = "service-role/AWSGlueServiceRole-default" + + # Export parquet files to S3 for Athena/Glue + comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", + dataset_name=crawler_name, + database_name=database, + glue_service_role=glue_service_role) + comstock.fix_timeseries_tables(crawler_name, database) + comstock.create_views(crawler_name, database, workgroup) # Create measure run comparisons; only use if run has measures - #comparison = cspp.ComStockMeasureComparison(comstock, states=comstock.states, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) + comparison = cspp.ComStockMeasureComparison(comstock, timeseries_locations_to_plot=comstock.timeseries_locations_to_plot, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) # Export dictionaries corresponding to the exported columns - comstock.export_data_and_enumeration_dictionary() + #comstock.export_data_and_enumeration_dictionary() # Code to execute the script if __name__=="__main__": diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 2fa54d2c5..dfa9f80fa 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -388,9 +388,19 @@ def download_data(self): self.read_delimited_truth_data_file_from_S3(s3_file_path, ',') def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, save_individual_regions=False): + + # Initialize Athena client + athena_client = BuildStockQuery(workgroup='eulp', + db_name='enduse', + table_name=self.comstock_run_name, + buildstock_type='comstock', + skip_reports=True, + metadata_table_suffix='_md_agg_by_state_and_county_vu', + ) + if reload_from_csv: file_name = f'Timeseries for AMI long.csv' - file_path = os.path.join(self.output_dir, file_name) + file_path = 
os.path.join(self.output_dir['fs_path'], file_name) if not os.path.exists(file_path): raise FileNotFoundError( f'Cannot find {file_path} to reload data, set reload_from_csv=False to create CSV.') @@ -403,13 +413,26 @@ def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, athena_end_uses.append('total_site_electricity_kwh') all_timeseries_df = pd.DataFrame() for region in ami.ami_region_map: - region_file_path_long = os.path.join(self.output_dir, region['source_name'] + '_building_type_timeseries_long.csv') + region_file_path_long = os.path.join(self.output_dir['fs_path'], region['source_name'] + '_building_type_timeseries_long.csv') if os.path.isfile(region_file_path_long) and reload_from_csv and save_individual_regions: logger.info(f"timeseries data in long format for {region['source_name']} already exists at {region_file_path_long}") continue - ts_agg = self.athena_client.agg.aggregate_timeseries(enduses=athena_end_uses, - group_by=['build_existing_model.building_type', 'time'], - restrict=[('build_existing_model.county_id', region['county_ids'])]) + + + # Use ComStockQueryBuilder + builder = ComStockQueryBuilder(self.comstock_run_name) + weight_view_table = f'{self.comstock_run_name}_md_agg_by_state_and_county_vu' + query = builder.get_timeseries_aggregation_query( + upgrade_id=0, + enduses=athena_end_uses, + group_by=[self.BLDG_TYPE, 'time'], + restrictions=[(self.COUNTY_ID, region['county_ids'])], + timestamp_grouping='hour', + weight_view_table=weight_view_table, + include_area_normalized_cols=True + ) + + ts_agg = athena_client.execute(query) ######################## END REVISED SECTION @@ -425,7 +448,7 @@ def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, timeseries_df['region_name'] = region['source_name'] all_timeseries_df = pd.concat([all_timeseries_df, timeseries_df]) - data_path = os.path.join(self.output_dir, 'Timeseries for AMI long.csv') + data_path = os.path.join(self.output_dir['fs_path'], 'Timeseries for AMI long.csv') all_timeseries_df.to_csv(data_path, index=True) self.ami_timeseries_data = all_timeseries_df @@ -433,7 +456,7 @@ def convert_timeseries_to_long(self, agg_df, county_ids, output_name, save_indiv # rename columns agg_df = agg_df.set_index('time') agg_df = agg_df.rename(columns={ - "build_existing_model.building_type": "building_type", + self.BLDG_TYPE: "building_type", "electricity_exterior_lighting_kwh": "exterior_lighting", "electricity_interior_lighting_kwh": "interior_lighting", "electricity_interior_equipment_kwh": "interior_equipment", @@ -444,7 +467,18 @@ def convert_timeseries_to_long(self, agg_df, county_ids, output_name, save_indiv "electricity_cooling_kwh": "cooling", "electricity_heating_kwh": "heating", "electricity_refrigeration_kwh": "refrigeration", - "total_site_electricity_kwh": "total" + "total_site_electricity_kwh": "total", + "electricity_exterior_lighting_kwh_per_sf": "exterior_lighting_per_sf", + "electricity_interior_lighting_kwh_per_sf": "interior_lighting_per_sf", + "electricity_interior_equipment_kwh_per_sf": "interior_equipment_per_sf", + "electricity_water_systems_kwh_per_sf": "water_systems_per_sf", + "electricity_heat_recovery_kwh_per_sf": "heat_recovery_per_sf", + "electricity_fans_kwh_per_sf": "fans_per_sf", + "electricity_pumps_kwh_per_sf": "pumps_per_sf", + "electricity_cooling_kwh_per_sf": "cooling_per_sf", + "electricity_heating_kwh_per_sf": "heating_per_sf", + "electricity_refrigeration_kwh_per_sf": "refrigeration_per_sf", + "total_site_electricity_kwh_per_sf": 
"kwh_per_sf" }) # aggregate by hour @@ -466,8 +500,9 @@ def convert_timeseries_to_long(self, agg_df, county_ids, output_name, save_indiv agg_df = agg_df.set_index('timestamp') agg_df = agg_df[agg_df.index.dayofyear != 366] - # melt into long format - val_vars = [ + # melt into long format with both kwh and kwh_per_sf columns + # First melt the absolute values (kwh) + kwh_vars = [ 'exterior_lighting', 'interior_lighting', 'interior_equipment', @@ -480,50 +515,82 @@ def convert_timeseries_to_long(self, agg_df, county_ids, output_name, save_indiv 'refrigeration', 'total' ] - agg_df = pd.melt( + + kwh_df = pd.melt( agg_df.reset_index(), id_vars=[ 'timestamp', 'building_type', 'sample_count' ], - value_vars=val_vars, + value_vars=kwh_vars, var_name='enduse', value_name='kwh' ).set_index('timestamp') - agg_df = agg_df.rename(columns={'sample_count': 'bldg_count'}) - # size get data - # note that this is a Polars dataframe, not a Pandas dataframe - self.data = self.data.with_columns([pl.col('in.nhgis_county_gisjoin').cast(pl.Utf8)]) - # size_df = self.data.filter(self.data['in.nhgis_county_gisjoin'].is_in(county_ids)) - size_df = self.data.clone().filter(pl.col('in.nhgis_county_gisjoin').is_in(county_ids)) - size_df = size_df.select(['in.comstock_building_type', 'in.sqft', 'calc.weighted.sqft']).collect() + # Then melt the normalized values (kwh_per_sf) + kwh_per_sf_vars = [ + 'exterior_lighting_per_sf', + 'interior_lighting_per_sf', + 'interior_equipment_per_sf', + 'water_systems_per_sf', + 'heat_recovery_per_sf', + 'fans_per_sf', + 'pumps_per_sf', + 'cooling_per_sf', + 'heating_per_sf', + 'refrigeration_per_sf', + 'kwh_per_sf' + ] - # Cast columns to match dtype of lists - size_df = size_df.with_columns([ - pl.col('in.comstock_building_type').cast(pl.Utf8), - pl.col('in.sqft').cast(pl.Float64), - pl.col('calc.weighted.sqft').cast(pl.Float64) - ]) + # Map per_sf column names to base enduse names + per_sf_mapping = { + 'exterior_lighting_per_sf': 'exterior_lighting', + 'interior_lighting_per_sf': 'interior_lighting', + 'interior_equipment_per_sf': 'interior_equipment', + 'water_systems_per_sf': 'water_systems', + 'heat_recovery_per_sf': 'heat_recovery', + 'fans_per_sf': 'fans', + 'pumps_per_sf': 'pumps', + 'cooling_per_sf': 'cooling', + 'heating_per_sf': 'heating', + 'refrigeration_per_sf': 'refrigeration', + 'kwh_per_sf': 'total' + } + + kwh_per_sf_df = pd.melt( + agg_df.reset_index(), + id_vars=[ + 'timestamp', + 'building_type', + 'sample_count' + ], + value_vars=kwh_per_sf_vars, + var_name='enduse', + value_name='kwh_per_sf_value' + ).set_index('timestamp') - size_df = size_df.group_by('in.comstock_building_type').agg(pl.col(['in.sqft', 'calc.weighted.sqft']).sum()) - size_df = size_df.with_columns((pl.col('calc.weighted.sqft')/pl.col('in.sqft')).alias('weight')) - size_df = size_df.with_columns((pl.col('in.comstock_building_type').replace(self.BLDG_TYPE_TO_SNAKE_CASE, default=None)).alias('building_type')) - # file_path = os.path.join(self.output_dir, output_name + '_building_type_size.csv') - # size_df.write_csv(file_path) + # Map the per_sf enduse names to base names + kwh_per_sf_df['enduse'] = kwh_per_sf_df['enduse'].map(per_sf_mapping) - weight_dict = dict(zip(size_df['building_type'].to_list(), size_df['weight'].to_list())) - weight_size_dict = dict(zip(size_df['building_type'].to_list(), size_df['calc.weighted.sqft'].to_list())) - agg_df['kwh_weighted'] = agg_df['kwh'] * agg_df['building_type'].map(weight_dict) + # Rename the value column to the final name + kwh_per_sf_df = 
kwh_per_sf_df.rename(columns={'kwh_per_sf_value': 'kwh_per_sf'}) - # calculate kwh/sf - agg_df['kwh_per_sf'] = agg_df.apply(lambda row: row['kwh_weighted'] / weight_size_dict.get(row['building_type'], 1), axis=1) - agg_df = agg_df.drop(['kwh', 'kwh_weighted'], axis=1) + # Reset index so timestamp becomes a column for merging + kwh_df = kwh_df.reset_index() + kwh_per_sf_df = kwh_per_sf_df.reset_index() + + # Merge the two dataframes on timestamp, building_type, sample_count, and enduse + agg_df = kwh_df.merge( + kwh_per_sf_df, + on=['timestamp', 'building_type', 'sample_count', 'enduse'], + how='inner' + ).set_index('timestamp') + agg_df = agg_df.rename(columns={'sample_count': 'bldg_count'})\ # save out long data format if save_individual_region: - output_file_path = os.path.join(self.output_dir, output_name + '_building_type_timeseries_long.csv') + output_file_path = os.path.join(self.output_dir['fs_path'], output_name + '_building_type_timeseries_long.csv') agg_df.to_csv(output_file_path, index=True) logger.info(f"Saved enduse timeseries in long format for {output_name} to {output_file_path}") @@ -540,9 +607,6 @@ def load_data(self, upgrade_id, acceptable_failure_percentage=0.01, drop_failed_ buildstock = pl.read_csv(os.path.join(self.data_dir, self.buildstock_file_name), infer_schema_length=50000) buildstock = buildstock.rename({'Building': 'sample_building_id'}) - - # if "sample_building_id" not in buildstock: - # raise Exception(f"the csv path is {os.path.join(self.data_dir, self.buildstock_file_name)}") buildstock_bldg_count = buildstock.shape[0] logger.debug(f'{buildstock_bldg_count} models in buildstock.csv') @@ -4278,6 +4342,7 @@ def column_filter(col): or col.name.startswith("out.electricity.total.nov") or col.name.startswith("out.electricity.total.oct") or col.name.startswith("out.electricity.total.sep") + or col.name == "in.airtightness..m3_per_m2_h" # Skip BIGINT airtightness column TODO: fix upstream in postproc ) @staticmethod @@ -4449,7 +4514,7 @@ def create_views( # when determining the unique set of filter values for SightGlass if c.name.startswith('in.') and (str(c.type) == 'BIGINT' or (str(c.type) == 'BOOLEAN')): col_type_errs += 1 - logger.error(f'in.foo column {c} may not be a BIGINT or BOOLEAN for SightGlass to work') + logger.error(f'in.foo column {c.name} may not be a BIGINT or BOOLEAN for SightGlass to work') # Expected bigint columns in SightGlass expected_bigints = ['metadata_index', 'upgrade', 'bldg_id'] @@ -4483,8 +4548,8 @@ def create_views( view_name = metadata_tblname.replace('_parquet', '') view_name = f"{view_name}_vu" db_plus_vu = f'{database_name}_{view_name}' - if len(db_plus_vu) > 63: - raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') + #if len(db_plus_vu) > 63: #TODO: add name truncation if we need to use this method for sightglass + # raise RuntimeError(f'db + view: `{db_plus_vu}` must be < 64 chars for SightGlass postgres storage') view = sa.Table(view_name, meta) create_view = CreateView(view, q, or_replace=True) logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") @@ -4590,8 +4655,6 @@ def determine_state_or_county_timeseries_table(self, location_input, location_na print(f"Warning: Unexpected location input type: {type(location_input)}, defaulting to state") return 'state' - - def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, upgrade_name): """ This method retrieves weighted timeseries profiles from s3/athena. 
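The two melts and the merge in convert_timeseries_to_long above can be seen on a toy frame. This is a standalone pandas sketch, not part of comstock.py, and it uses a simple string replace in place of the explicit per_sf_mapping dictionary used in the real method.

import pandas as pd

# Toy hourly data with one end use in both absolute (kwh) and area-normalized (kwh_per_sf) form.
toy = pd.DataFrame({
    'timestamp': pd.to_datetime(['2018-01-01 00:00', '2018-01-01 01:00']),
    'building_type': ['small_office', 'small_office'],
    'sample_count': [10, 10],
    'cooling': [5.0, 6.0],
    'cooling_per_sf': [0.0010, 0.0012],
})

id_cols = ['timestamp', 'building_type', 'sample_count']

# Melt absolute values, then normalized values, into long format.
kwh = toy.melt(id_vars=id_cols, value_vars=['cooling'], var_name='enduse', value_name='kwh')
per_sf = toy.melt(id_vars=id_cols, value_vars=['cooling_per_sf'], var_name='enduse',
                  value_name='kwh_per_sf')
per_sf['enduse'] = per_sf['enduse'].str.replace('_per_sf', '', regex=False)

# Merge on the shared keys so each row carries both kwh and kwh_per_sf for one end use.
long_df = kwh.merge(per_sf, on=id_cols + ['enduse'], how='inner')
print(long_df)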
@@ -4686,7 +4749,7 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up # Execute query to get applicable buildings applic_df = athena_client.execute(applicability_query) - applic_bldgs_list = list(applic_df['bldg_id'].unique()) + applic_bldgs_list = list(applic_df[self.BLDG_ID].unique()) applic_bldgs_list = [int(x) for x in applic_bldgs_list] import time @@ -4695,9 +4758,9 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up # Build geographic restrictions for all locations geo_restrictions = [] if state_locations: - geo_restrictions.append(('in.state', state_locations)) + geo_restrictions.append((self.STATE_ABBRV, state_locations)) if county_locations: - geo_restrictions.append(('in.nhgis_county_gisjoin', county_locations)) + geo_restrictions.append((self.COUNTY_ID, county_locations)) # loop through upgrades for upgrade_id in df[self.UPGRADE_ID].unique(): @@ -4709,7 +4772,7 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up builder = ComStockQueryBuilder(self.comstock_run_name) # Build restrictions list including all geographic locations - restrictions = [('bldg_id', applic_bldgs_list)] + restrictions = [(self.BLDG_ID, applic_bldgs_list)] restrictions.extend(geo_restrictions) query = builder.get_timeseries_aggregation_query( diff --git a/postprocessing/comstockpostproc/comstock_measure_comparison.py b/postprocessing/comstockpostproc/comstock_measure_comparison.py index a08fe7405..7c9d27212 100644 --- a/postprocessing/comstockpostproc/comstock_measure_comparison.py +++ b/postprocessing/comstockpostproc/comstock_measure_comparison.py @@ -92,7 +92,6 @@ def __init__(self, comstock_object: comstock.ComStock, timeseries_locations_to_p else: logger.info("make_comparison_plots is set to false, so not plots were created. Set make_comparison_plots to True for plots.") - # make plots comparing multiple upgrades together for comp_name, comp_up_ids in self.upgrade_ids_for_comparison.items(): @@ -120,7 +119,7 @@ def __init__(self, comstock_object: comstock.ComStock, timeseries_locations_to_p # make consumption plots for upgrades if requested by user if make_comparison_plots: - self.make_comparative_plots(df_upgrade, self.column_for_grouping, timeseries_locations_to_plot, make_timeseries_plots, color_map, comp_output_dir) + self.make_comparative_plots(df_upgrade, self.column_for_grouping, timeseries_locations_to_plot, make_timeseries_plots, color_map, comp_output_dir, self.s3_base_dir, self.athena_table_name, self.comstock_run_name) else: logger.info("make_comparison_plots is set to false, so not plots were created. 
Set make_comparison_plots to True for plots.") end_time = pd.Timestamp.now() @@ -132,66 +131,63 @@ def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, timeseries_l BASIC_PARAMS = { 'column_for_grouping': column_for_grouping, 'color_map': color_map, - 'output_dir': output_dir, - 's3_base_dir':s3_base_dir, - 'athena_table_name':athena_table_name, - 'comstock_run_name':comstock_run_name + 'output_dir': output_dir } - #LazyFramePlotter.plot_with_lazy( - # plot_method=self.plot_energy_by_enduse_and_fuel_type, - # lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + self.lazyframe_plotter.WTD_COLUMNS_ANN_PV + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy( - # plot_method=self.plot_emissions_by_fuel_type, - # lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_GHG_COLUMNS))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_utility_bills_by_fuel_type, lazy_frame=lazy_frame.clone(), columns=( - # self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_UTILITY_COLUMNS))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy( - # plot_method=self.plot_floor_area_and_energy_totals, - # lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy( - # plot_method=self.plot_floor_area_and_energy_totals_by_building_type, - # lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy( - # plot_method=self.plot_end_use_totals_by_building_type, - # lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + [self.BLDG_TYPE, self.CEN_DIV]))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_histograms_by_building_type, - # lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.BLDG_TYPE]))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_boxplots_by_building_type, - # lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.CEN_DIV, self.BLDG_TYPE]))(**BASIC_PARAMS) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_enduse_and_fuel, lazy_frame=lazy_frame.clone( - #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_ENDUSE_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( - #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_hvac_system_type, lazy_frame=lazy_frame.clone( - #), 
columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_fuel, lazy_frame=lazy_frame.clone( - #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone( - #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( - #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_hvac_system, lazy_frame=lazy_frame.clone( - #), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) - - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_timing, lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) - - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_max_use, lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) - - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_min_use, lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) - - #LazyFramePlotter.plot_with_lazy(plot_method=self.plot_unmet_hours, lazy_frame=lazy_frame.clone(), - # columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.UNMET_HOURS_COLS))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy( + plot_method=self.plot_energy_by_enduse_and_fuel_type, + lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + self.lazyframe_plotter.WTD_COLUMNS_ANN_PV + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy( + plot_method=self.plot_emissions_by_fuel_type, + lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_GHG_COLUMNS))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_utility_bills_by_fuel_type, lazy_frame=lazy_frame.clone(), columns=( + self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_UTILITY_COLUMNS))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy( + plot_method=self.plot_floor_area_and_energy_totals, + lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy( + plot_method=self.plot_floor_area_and_energy_totals_by_building_type, + lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_SUMMARIZE))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy( + plot_method=self.plot_end_use_totals_by_building_type, + lazy_frame=lazy_frame.clone(), + 
columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.WTD_COLUMNS_ANN_ENDUSE + [self.BLDG_TYPE, self.CEN_DIV]))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_histograms_by_building_type, + lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.BLDG_TYPE]))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_eui_boxplots_by_building_type, + lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_ANN_TOTL_COLUMNS + [self.CEN_DIV, self.BLDG_TYPE]))(**BASIC_PARAMS) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_enduse_and_fuel, lazy_frame=lazy_frame.clone( + ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_ENDUSE_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( + ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_savings_distributions_by_hvac_system_type, lazy_frame=lazy_frame.clone( + ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.SAVINGS_DISTRI_BUILDINTYPE + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_fuel, lazy_frame=lazy_frame.clone( + ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.UPGRADE_ID]))(output_dir=output_dir) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_building_type, lazy_frame=lazy_frame.clone( + ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.BLDG_TYPE, self.UPGRADE_ID]))(output_dir=output_dir) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_climate_zone, lazy_frame=lazy_frame.clone( + ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.CZ_ASHRAE, self.UPGRADE_ID]))(output_dir=output_dir) + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_utility_savings_distributions_by_hvac_system, lazy_frame=lazy_frame.clone( + ), columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.EUI_SAVINGS_COLUMNS + [self.HVAC_SYS, self.UPGRADE_ID]))(output_dir=output_dir) + + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_timing, lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) + + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_max_use, lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) + + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_qoi_min_use, lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + 
self.lazyframe_plotter.QOI_COLUMNS))(**BASIC_PARAMS) + + LazyFramePlotter.plot_with_lazy(plot_method=self.plot_unmet_hours, lazy_frame=lazy_frame.clone(), + columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.UNMET_HOURS_COLS))(**BASIC_PARAMS) if make_timeseries_plots: TIMESERIES_PARAMS = {'comstock_run_name': self.comstock_run_name, 'timeseries_locations_to_plot': timeseries_locations_to_plot, 'color_map': color_map, @@ -213,10 +209,7 @@ def make_comparative_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, BASIC_PARAMS = { 'column_for_grouping': column_for_grouping, 'color_map': color_map, - 'output_dir': output_dir, - 's3_base_dir': s3_base_dir, - 'athena_table_name': athena_table_name, - 'comstock_run_name': comstock_run_name + 'output_dir': output_dir } logger.info(f'Making comparison plots for upgrade groupings') diff --git a/postprocessing/comstockpostproc/comstock_query_builder.py b/postprocessing/comstockpostproc/comstock_query_builder.py index 1f940c79f..f16b26374 100644 --- a/postprocessing/comstockpostproc/comstock_query_builder.py +++ b/postprocessing/comstockpostproc/comstock_query_builder.py @@ -2,8 +2,7 @@ Query templates and builders for ComStock data extraction from Athena. This module provides SQL query templates and builder functions to construct -Athena queries for various ComStock analysis needs, separating query logic -from the main ComStock processing classes. +Athena queries for various ComStock analysis needs. """ import logging @@ -31,6 +30,7 @@ def __init__(self, athena_table_name: str): self.timeseries_table = f"{athena_table_name}_timeseries" self.baseline_table = f"{athena_table_name}_baseline" + #TODO: this method has not yet been validated, and should be prior to use. def get_monthly_energy_consumption_query(self, upgrade_ids: Optional[List[Union[int, str]]] = None, states: Optional[List[str]] = None, @@ -47,63 +47,65 @@ def get_monthly_energy_consumption_query(self, SQL query string """ - ## Build WHERE clause conditions - #where_conditions = ['"build_existing_model.building_type" IS NOT NULL'] - - #if upgrade_ids: - # upgrade_list = ', '.join([f'"{uid}"' for uid in upgrade_ids]) - # where_conditions.append(f'"upgrade" IN ({upgrade_list})') - - #if states: - # state_list = ', '.join([f'"{state}"' for state in states]) - # where_conditions.append(f'SUBSTRING("build_existing_model.county_id", 2, 2) IN ({state_list})') - - #if building_types: - # btype_list = ', '.join([f'"{bt}"' for bt in building_types]) - # where_conditions.append(f'"build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" IN ({btype_list})') - - #where_clause = ' AND '.join(where_conditions) - - #query = f""" - # SELECT - # "upgrade", - # "month", - # "state_id", - # "building_type", - # sum("total_site_gas_kbtu") AS "total_site_gas_kbtu", - # sum("total_site_electricity_kwh") AS "total_site_electricity_kwh" - # FROM - # ( - # SELECT - # EXTRACT(MONTH from "time") as "month", - # SUBSTRING("build_existing_model.county_id", 2, 2) AS "state_id", - # "build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" as "building_type", - # "upgrade", - # "total_site_gas_kbtu", - # "total_site_electricity_kwh" - # FROM - # "{self.timeseries_table}" - # JOIN "{self.baseline_table}" - # ON "{self.timeseries_table}"."building_id" = "{self.baseline_table}"."building_id" - # WHERE {where_clause} - # ) - # GROUP BY - # "upgrade", - # "month", - # "state_id", - # "building_type" - #""" - - #return query.strip() + # Build WHERE clause conditions + 
where_conditions = ['"build_existing_model.building_type" IS NOT NULL'] + + if upgrade_ids: + upgrade_list = ', '.join([f'"{uid}"' for uid in upgrade_ids]) + where_conditions.append(f'"upgrade" IN ({upgrade_list})') + + if states: + state_list = ', '.join([f'"{state}"' for state in states]) + where_conditions.append(f'SUBSTRING("build_existing_model.county_id", 2, 2) IN ({state_list})') + + if building_types: + btype_list = ', '.join([f'"{bt}"' for bt in building_types]) + where_conditions.append(f'"build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" IN ({btype_list})') + + where_clause = ' AND '.join(where_conditions) + + query = f""" + SELECT + "upgrade", + "month", + "state_id", + "building_type", + sum("total_site_gas_kbtu") AS "total_site_gas_kbtu", + sum("total_site_electricity_kwh") AS "total_site_electricity_kwh" + FROM + ( + SELECT + EXTRACT(MONTH from "time") as "month", + SUBSTRING("build_existing_model.county_id", 2, 2) AS "state_id", + "build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" as "building_type", + "upgrade", + "total_site_gas_kbtu", + "total_site_electricity_kwh" + FROM + "{self.timeseries_table}" + JOIN "{self.baseline_table}" + ON "{self.timeseries_table}"."building_id" = "{self.baseline_table}"."building_id" + WHERE {where_clause} + ) + GROUP BY + "upgrade", + "month", + "state_id", + "building_type" + """ + + return query.strip() def get_timeseries_aggregation_query(self, upgrade_id: Union[int, str], enduses: List[str], + weight_view_table: str, + group_by: Optional[List[str]] = None, restrictions: List[tuple] = None, timestamp_grouping: str = 'hour', building_ids: Optional[List[int]] = None, - weight_view_table: Optional[str] = None, - include_sample_stats: bool = True) -> str: + include_sample_stats: bool = True, + include_area_normalized_cols: bool = False) -> str: """ Build query for timeseries data aggregation similar to buildstock query functionality. @@ -113,21 +115,24 @@ def get_timeseries_aggregation_query(self, Args: upgrade_id: Upgrade ID to filter on enduses: List of end use columns to select + weight_view_table: Name of the weight view table (required - e.g., 'rtuadv_v11_md_agg_national_by_state_vu' or 'rtuadv_v11_md_agg_national_by_county_vu') + group_by: Additional columns to group by (e.g., ['build_existing_model.building_type']) restrictions: List of (column, values) tuples for filtering timestamp_grouping: How to group timestamps ('hour', 'day', 'month') building_ids: Specific building IDs to filter on (optional) - weight_view_table: Name of the weight view table (e.g., 'rtuadv_v11_md_agg_national_by_state_vu') include_sample_stats: Whether to include sample_count, units_count, rows_per_sample + include_area_normalized_cols: Whether to include weighted area and kwh_weighted columns for AMI comparison Returns: SQL query string """ - # If no weight view table provided, construct default name + # Weight view table is required - cannot auto-assign without knowing state vs county aggregation if weight_view_table is None: - weight_view_table = f"{self.athena_table_name}_md_agg_national_by_state_vu" #TODO: make table name dynamic to aggregation level + raise ValueError("weight_view_table parameter is required. Cannot auto-assign without knowing geographic aggregation level (state vs county). 
" + "Please provide the full weight view table name (e.g., 'your_dataset_md_agg_national_by_state_vu' or 'your_dataset_md_agg_national_by_county_vu')") - # Build timestamp grouping with date_trunc and time adjustment (like buildstock does) + # Build timestamp grouping with date_trunc and time adjustment if timestamp_grouping == 'hour': time_select = f"date_trunc('hour', date_add('second', -900, {self.timeseries_table}.time)) AS time" time_group = '1' @@ -141,9 +146,15 @@ def get_timeseries_aggregation_query(self, time_select = f"{self.timeseries_table}.time AS time" time_group = '1' - # Build SELECT clause with sample statistics (matching buildstock pattern) + # Build SELECT clause with sample statistics select_clauses = [time_select] + # Add group_by columns to SELECT (excluding 'time' which is already handled) + if group_by: + for col in group_by: + if col != 'time': + select_clauses.append(f'{weight_view_table}."{col}"') + if include_sample_stats: select_clauses.extend([ f"count(distinct({self.timeseries_table}.building_id)) AS sample_count", @@ -151,11 +162,23 @@ def get_timeseries_aggregation_query(self, f"sum(1) / count(distinct({self.timeseries_table}.building_id)) AS rows_per_sample" ]) + # Add weighted area if requested (for AMI comparison) + if include_area_normalized_cols: + weighted_area = f'sum({weight_view_table}."in.sqft" * {weight_view_table}.weight) AS weighted_sqft' + select_clauses.append(weighted_area) + # Build weighted enduse aggregations for enduse in enduses: weighted_sum = f"sum({self.timeseries_table}.{enduse} * 1 * {weight_view_table}.weight) AS {enduse}" select_clauses.append(weighted_sum) + # Add kwh_per_sf columns if requested (for AMI comparison - normalized by weighted area) + # Note: When aggregating 15-minute data to hourly, area gets summed multiple times + # Divide by rows_per_sample to get the correct area normalization + if include_area_normalized_cols and enduse.endswith('_kwh'): + kwh_per_sf = f"sum({self.timeseries_table}.{enduse} * 1 * {weight_view_table}.weight) / (sum({weight_view_table}.\"in.sqft\" * {weight_view_table}.weight) / (sum(1) / count(distinct({self.timeseries_table}.building_id)))) AS {enduse.replace('_kwh', '_kwh_per_sf')}" + select_clauses.append(kwh_per_sf) + select_clause = ',\n '.join(select_clauses) # Build WHERE clause - join conditions first, then filters @@ -196,139 +219,27 @@ def get_timeseries_aggregation_query(self, where_clause = ' AND '.join(where_conditions) - # Build the query using FROM clause with comma-separated tables (like buildstock) + # Build GROUP BY clause - use column positions based on SELECT order + group_by_positions = [time_group] # Time is always position 1 + if group_by: + # Add positions for additional group_by columns (excluding 'time') + position = 2 # Start after time + for col in group_by: + if col != 'time': + group_by_positions.append(str(position)) + position += 1 + + group_by_clause = ', '.join(group_by_positions) + + # Build the query using FROM clause with comma-separated tables query = f"""SELECT {select_clause} FROM {weight_view_table}, {self.timeseries_table} WHERE {where_clause} - GROUP BY {time_group} - ORDER BY {time_group}""" + GROUP BY {group_by_clause} + ORDER BY {group_by_clause}""" return query - def get_timeseries_aggregation_query_with_join(self, - upgrade_id: Union[int, str], - enduses: List[str], - restrictions: List[tuple] = None, - timestamp_grouping: str = 'hour', - building_ids: Optional[List[int]] = None) -> str: - """ - Alternative version using JOIN syntax instead of 
comma-separated FROM clause. - Use this if you prefer explicit JOIN syntax over the buildstock comma-style. - """ - - # Build SELECT clause for enduses - enduse_selects = [] - for enduse in enduses: - enduse_selects.append(f'sum("{enduse}") AS "{enduse}"') - - enduse_clause = ',\n '.join(enduse_selects) - - # Build timestamp grouping - if timestamp_grouping == 'hour': - time_select = 'EXTRACT(HOUR from "time") as "hour"' - time_group = '"hour"' - elif timestamp_grouping == 'day': - time_select = 'EXTRACT(DAY from "time") as "day"' - time_group = '"day"' - elif timestamp_grouping == 'month': - time_select = 'EXTRACT(MONTH from "time") as "month"' - time_group = '"month"' - else: - time_select = '"time"' - time_group = '"time"' - - # Build WHERE clause - where_conditions = [f'"upgrade" = "{upgrade_id}"'] - - if building_ids: - bldg_list = ', '.join([str(bid) for bid in building_ids]) - where_conditions.append(f'"{self.timeseries_table}"."building_id" IN ({bldg_list})') - - if restrictions: - for column, values in restrictions: - if isinstance(values, (list, tuple)): - value_list = ', '.join([f'"{v}"' for v in values]) - where_conditions.append(f'"{column}" IN ({value_list})') - else: - where_conditions.append(f'"{column}" = "{values}"') - - where_clause = ' AND '.join(where_conditions) - - query = f""" - SELECT - {time_select}, - {enduse_clause} - FROM - "{self.timeseries_table}" - JOIN "{self.baseline_table}" - ON "{self.timeseries_table}"."building_id" = "{self.baseline_table}"."building_id" - WHERE {where_clause} - GROUP BY - {time_group} - ORDER BY - {time_group} - """ - - return query.strip() - - def get_building_characteristics_query(self, - upgrade_ids: Optional[List[Union[int, str]]] = None, - characteristics: Optional[List[str]] = None, - filters: Optional[Dict[str, Union[str, List[str]]]] = None) -> str: - """ - Build query to extract building characteristics and metadata. 
- - Args: - upgrade_ids: List of upgrade IDs to include (optional, defaults to all) - characteristics: Specific characteristic columns to select (optional) - filters: Dictionary of column: value(s) filters to apply - - Returns: - SQL query string - """ - - # Default characteristics if none specified - if characteristics is None: - characteristics = [ - 'building_id', - 'upgrade', - 'build_existing_model.building_type', - 'build_existing_model.county_id', - 'build_existing_model.create_bar_from_building_type_ratios_bldg_type_a', - 'in.sqft', - 'in.geometry_building_type_recs' - ] - - # Build SELECT clause - select_clause = ',\n '.join([f'"{char}"' for char in characteristics]) - - # Build WHERE clause - where_conditions = [] - - if upgrade_ids: - upgrade_list = ', '.join([f'"{uid}"' for uid in upgrade_ids]) - where_conditions.append(f'"upgrade" IN ({upgrade_list})') - - if filters: - for column, values in filters.items(): - if isinstance(values, (list, tuple)): - value_list = ', '.join([f'"{v}"' for v in values]) - where_conditions.append(f'"{column}" IN ({value_list})') - else: - where_conditions.append(f'"{column}" = "{values}"') - - where_clause = 'WHERE ' + ' AND '.join(where_conditions) if where_conditions else '' - - query = f""" - SELECT - {select_clause} - FROM - "{self.baseline_table}" - {where_clause} - """ - - return query.strip() - def get_applicability_query(self, upgrade_ids: List[Union[int, str]], state: Optional[Union[str, List[str]]] = None, @@ -349,9 +260,12 @@ def get_applicability_query(self, SQL query string """ - # If no weight view table provided, construct default name + # If no weight view table provided, construct name based on geographic level if weight_view_table is None: - weight_view_table = f"{self.athena_table_name}_md_agg_national_by_state_vu" + if county is not None: + weight_view_table = f"{self.athena_table_name}_md_agg_national_by_county_vu" + else: + weight_view_table = f"{self.athena_table_name}_md_agg_national_by_state_vu" # Default columns for applicability queries if columns is None: @@ -411,96 +325,4 @@ def get_applicability_query(self, ORDER BY bldg_id """ - return query.strip() - - -def get_monthly_energy_query(athena_table_name: str, **kwargs) -> str: - """ - Convenience function to get monthly energy consumption query. - - Args: - athena_table_name: Base Athena table name - **kwargs: Additional arguments passed to get_monthly_energy_consumption_query - - Returns: - SQL query string - """ - builder = ComStockQueryBuilder(athena_table_name) - return builder.get_monthly_energy_consumption_query(**kwargs) - - -def get_timeseries_query(athena_table_name: str, **kwargs) -> str: - """ - Convenience function to get timeseries aggregation query. - - Args: - athena_table_name: Base Athena table name - **kwargs: Additional arguments passed to get_timeseries_aggregation_query - - Returns: - SQL query string - """ - builder = ComStockQueryBuilder(athena_table_name) - return builder.get_timeseries_aggregation_query(**kwargs) - - -def get_building_chars_query(athena_table_name: str, **kwargs) -> str: - """ - Convenience function to get building characteristics query. 
- - Args: - athena_table_name: Base Athena table name - **kwargs: Additional arguments passed to get_building_characteristics_query - - Returns: - SQL query string - """ - builder = ComStockQueryBuilder(athena_table_name) - return builder.get_building_characteristics_query(**kwargs) - - -# Query template constants for common patterns -MONTHLY_ENERGY_TEMPLATE = """ -SELECT - "upgrade", - "month", - "state_id", - "building_type", - sum("total_site_gas_kbtu") AS "total_site_gas_kbtu", - sum("total_site_electricity_kwh") AS "total_site_electricity_kwh" -FROM -( - SELECT - EXTRACT(MONTH from "time") as "month", - SUBSTRING("build_existing_model.county_id", 2, 2) AS "state_id", - "build_existing_model.create_bar_from_building_type_ratios_bldg_type_a" as "building_type", - "upgrade", - "total_site_gas_kbtu", - "total_site_electricity_kwh" - FROM - "{timeseries_table}" - JOIN "{baseline_table}" - ON "{timeseries_table}"."building_id" = "{baseline_table}"."building_id" - WHERE {where_clause} -) -GROUP BY - "upgrade", - "month", - "state_id", - "building_type" -""" - -TIMESERIES_AGG_TEMPLATE = """ -SELECT - {time_grouping}, - {enduse_aggregations} -FROM - "{timeseries_table}" -JOIN "{baseline_table}" - ON "{timeseries_table}"."building_id" = "{baseline_table}"."building_id" -WHERE {where_clause} -GROUP BY - {time_grouping} -ORDER BY - {time_grouping} -""" \ No newline at end of file + return query.strip() \ No newline at end of file diff --git a/postprocessing/comstockpostproc/comstock_to_ami_comparison.py b/postprocessing/comstockpostproc/comstock_to_ami_comparison.py index 87ef1dcc7..2fecaf0ab 100644 --- a/postprocessing/comstockpostproc/comstock_to_ami_comparison.py +++ b/postprocessing/comstockpostproc/comstock_to_ami_comparison.py @@ -35,17 +35,17 @@ def __init__(self, comstock_object:ComStock, ami_object:AMI, image_type='jpg', n logger.warning(f'No timeseries data was available for {dataset.dataset_name}, unable to make AMI comparison.') continue if isinstance(dataset, ComStock): - df = dataset.ami_timeseries_data + df = dataset.ami_timeseries_data.copy() df['run'] = self.comstock_object.dataset_name df['year'] = self.comstock_object.year + # Normalize building type names to match AMI format + df['building_type'] = df['building_type'].map(self.BLDG_TYPE_TO_SNAKE_CASE).fillna(df['building_type'].str.lower()) dfs_to_concat.append(df) - df.iloc[0:0] elif isinstance(dataset, AMI): - df = dataset.ami_timeseries_data + df = dataset.ami_timeseries_data.copy() df['run'] = self.ami_object.dataset_name df['enduse'] = 'total' dfs_to_concat.append(df) - df.iloc[0:0] self.color_map[dataset.dataset_name] = dataset.color dataset_names.append(dataset.dataset_name) @@ -54,7 +54,10 @@ def __init__(self, comstock_object:ComStock, ami_object:AMI, image_type='jpg', n self.name = ' vs '.join(dataset_names) # Combine into a single dataframe for convenience - self.ami_timeseries_data = pd.concat(dfs_to_concat, join='outer') + if len(dfs_to_concat) == 0: + self.ami_timeseries_data = pd.DataFrame() + else: + self.ami_timeseries_data = pd.concat(dfs_to_concat, join='outer') # Make directories current_dir = os.path.dirname(os.path.abspath(__file__)) diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index c02b04eee..d3426c142 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -3152,13 +3152,13 @@ def map_to_season(month): else: # if more than 1 upgrade, add only aggregate loads for 
upgrade in upgrade_num: - dfs_merged_pw_up_mult = dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[upgrade]] + dfs_merged_pw_up_mult = dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[int(upgrade)]] upgrade_trace = go.Scatter( x=dfs_merged_pw_up_mult['time'], y=dfs_merged_pw_up_mult['Measure Total'], mode='lines', - line=dict(width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[upgrade]] - name=self.dict_upid_to_upname[upgrade], + line=dict(width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[int(upgrade)]] + name=self.dict_upid_to_upname[int(upgrade)], ) traces.append(upgrade_trace) @@ -3216,8 +3216,7 @@ def map_to_season(month): fig.write_image(fig_path, scale=6) fig.write_html(fig_path_html) - - print(f"{fig_sub_dir}/timeseries_data_{location_name})") + # save timeseries data dfs_merged.to_csv(f"{fig_sub_dir}/timeseries_data_{location_name}.csv") def plot_measure_timeseries_season_average_by_state(self, df, output_dir, timeseries_locations_to_plot, color_map, comstock_run_name, comstock_obj=None): @@ -3361,14 +3360,14 @@ def map_to_dow(dow): showlegend = upgrade not in legend_entries legend_entries.add(upgrade) - dfs_merged_pw_up_mult = dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[upgrade], :] + dfs_merged_pw_up_mult = dfs_merged_pw_up.loc[dfs_merged_pw_up['in.upgrade_name'] == self.dict_upid_to_upname[int(upgrade)], :] upgrade_trace = go.Scatter( x=dfs_merged_pw_up_mult['Hour_of_Day'], y=dfs_merged_pw_up_mult['Measure Total'], mode='lines', - line=dict(color=upgrade_colors[upgrade], width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[upgrade]] - name=self.dict_upid_to_upname[upgrade], - legendgroup=self.dict_upid_to_upname[upgrade], + line=dict(color=upgrade_colors[upgrade], width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[int(upgrade)]] + name=self.dict_upid_to_upname[int(upgrade)], + legendgroup=self.dict_upid_to_upname[int(upgrade)], showlegend=showlegend ) fig.add_trace(upgrade_trace, row=row, col=col) @@ -3563,15 +3562,15 @@ def map_to_dow(dow): else: # if more than 1 upgrade, add only aggregate loads for upgrade in upgrade_num: - dfs_merged_pw_up_mult = dfs_merged_gb_season_up.loc[dfs_merged_gb_season_up['in.upgrade_name'] == self.dict_upid_to_upname[upgrade]] + dfs_merged_pw_up_mult = dfs_merged_gb_season_up.loc[dfs_merged_gb_season_up['in.upgrade_name'] == self.dict_upid_to_upname[int(upgrade)]] upgrade_trace = go.Scatter( x=dfs_merged_pw_up_mult['Hour_of_Day'], y=dfs_merged_pw_up_mult[enduse]/1000, mode='lines', - line=dict(color=upgrade_colors[upgrade], width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[upgrade]] - name=self.dict_upid_to_upname[upgrade], - legendgroup=self.dict_upid_to_upname[upgrade], + line=dict(color=upgrade_colors[upgrade], width=1.8, dash='solid'), #color=color_map[self.dict_upid_to_upname[int(upgrade)]] + name=self.dict_upid_to_upname[int(upgrade)], + legendgroup=self.dict_upid_to_upname[int(upgrade)], showlegend=showlegend ) fig.add_trace(upgrade_trace, row=row, col=col) From 285422425e651f9f6803ca57a6bd9b786dd63b41 Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Tue, 18 Nov 2025 09:56:27 -0700 Subject: [PATCH 10/16] Adds warning if an applicability list for timeseries is empty instead of failing --- postprocessing/comstockpostproc/comstock.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git 
a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index dfa9f80fa..afb4788f4 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -4752,6 +4752,12 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up applic_bldgs_list = list(applic_df[self.BLDG_ID].unique()) applic_bldgs_list = [int(x) for x in applic_bldgs_list] + # Check if any applicable buildings were found + if len(applic_bldgs_list) == 0: + location_desc = f"locations {location_list}" if len(location_list) > 1 else f"{primary_geo_type} {location_list[0]}" + print(f"Warning: No applicable buildings found for {location_desc} and upgrade(s) {upgrade_num}. Returning empty DataFrames.") + return pd.DataFrame(), pd.DataFrame() + import time from datetime import datetime, time From d64e60ae1d3ce988e8bbcd3b31ab8f178ff435de Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Mon, 22 Dec 2025 09:30:22 -0700 Subject: [PATCH 11/16] Update postprocessing/comstockpostproc/plotting_mixin.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- postprocessing/comstockpostproc/plotting_mixin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index d3426c142..0c4deb8cf 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -3059,7 +3059,7 @@ def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, timeseries_ df_data = df.copy() # coerce data type of upgrade ID - arrives as float, need str of int for querying df_data[self.UPGRADE_ID] = pd.to_numeric(df_data[self.UPGRADE_ID], errors="coerce").astype("Int64").astype(str) - df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0") & (df_data[self.UPGRADE_ID]!=0), :] + df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0"), :] upgrade_num = list(df_upgrade[self.UPGRADE_ID].unique()) upgrade_name = list(df_upgrade[self.UPGRADE_NAME].unique()) From 9d337a2c875461648f1b76181449efa76b9bb7bc Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Mon, 22 Dec 2025 10:13:25 -0700 Subject: [PATCH 12/16] Update postprocessing/comstockpostproc/comstock.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- postprocessing/comstockpostproc/comstock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 23fd167df..d8da6bbaf 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -4785,7 +4785,7 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up ) location_desc = f"locations {location_list}" if len(location_list) > 1 else f"{primary_geo_type} {location_list[0]}" - print(f"Getting weighted baseline load profile for {location_desc} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings (using {agg} table).") + logger.info(f"Getting weighted baseline load profile for {location_desc} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings (using {agg} table).") # Execute query df_base_ts_agg_weighted = athena_client.execute(query) From 1014b959fb7994ac28457e9be87bcba7674aa4f0 Mon Sep 17 00:00:00 2001 From: 
Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Mon, 22 Dec 2025 10:14:54 -0700 Subject: [PATCH 13/16] Update postprocessing/comstockpostproc/plotting_mixin.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- postprocessing/comstockpostproc/plotting_mixin.py | 1 - 1 file changed, 1 deletion(-) diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 0c4deb8cf..7d6cc51d6 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -15,7 +15,6 @@ from buildstock_query import BuildStockQuery import matplotlib.colors as mcolors from plotly.subplots import make_subplots -import comstockpostproc.comstock as comstock from comstockpostproc.comstock_query_builder import ComStockQueryBuilder import fsspec import s3fs From 3ef05779c03a2d4e68cfb239d56b8494ea5c3bdb Mon Sep 17 00:00:00 2001 From: Chris CaraDonna <50876267+ChristopherCaradonna@users.noreply.github.com> Date: Mon, 22 Dec 2025 10:43:39 -0700 Subject: [PATCH 14/16] Update postprocessing/comstockpostproc/plotting_mixin.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- postprocessing/comstockpostproc/plotting_mixin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 7d6cc51d6..5c061a1c6 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -3432,7 +3432,7 @@ def plot_measure_timeseries_annual_average_by_state_and_enduse(self, df, output_ df_data = df.copy() # coerce data type of upgrade ID - arrives as float, need str of int for querying df_data[self.UPGRADE_ID] = pd.to_numeric(df_data[self.UPGRADE_ID], errors="coerce").astype("Int64").astype(str) - df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0") & (df_data[self.UPGRADE_ID]!=0), :] + df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0"), :] upgrade_num = list(df_upgrade[self.UPGRADE_ID].unique()) upgrade_name = list(df_upgrade[self.UPGRADE_NAME].unique()) From 14650c5daf69f9068bff456ca7668b22cc709e2c Mon Sep 17 00:00:00 2001 From: christophercaradonna Date: Tue, 30 Dec 2025 08:11:02 -0700 Subject: [PATCH 15/16] PR review updates --- .../compare_comstock_to_ami.py.template | 77 +++++++++---------- .../compare_upgrades-test_timeseries_plots.py | 56 ++++---------- postprocessing/compare_upgrades.py.template | 4 +- postprocessing/comstockpostproc/comstock.py | 52 ++++--------- .../comstock_measure_comparison.py | 8 +- .../comstock_query_builder.py | 71 ++++++++++------- .../comstock_to_ami_comparison.py | 4 +- .../comstockpostproc/plotting_mixin.py | 21 +---- postprocessing/setup.py | 2 +- 9 files changed, 123 insertions(+), 172 deletions(-) diff --git a/postprocessing/compare_comstock_to_ami.py.template b/postprocessing/compare_comstock_to_ami.py.template index fa7ce6b3e..2f2e8a2aa 100644 --- a/postprocessing/compare_comstock_to_ami.py.template +++ b/postprocessing/compare_comstock_to_ami.py.template @@ -23,7 +23,7 @@ def main(): drop_failed_runs=True, # False if you want to evaluate which runs failed in raw output data color_hex='#0072B2', # Color used to represent this run in plots skip_missing_columns=True, # False if you want to ensure you have all data specified for export - reload_from_cache=True, # True if CSV already made and want faster reload times + reload_from_cache=False, # True 
if CSV already made and want faster reload times include_upgrades=False, # False if not looking at upgrades # output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' ) @@ -33,7 +33,7 @@ def main(): stock_estimate = cspp.Apportion( stock_estimation_version='2025R3', # Only updated when a new stock estimate is published truth_data_version='v01', # Typically don't change this - reload_from_cache=True # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. + reload_from_cache=False # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. ) # Scale ComStock runs to the 'truth data' from StockE V3 estimates using bucket-based apportionment @@ -58,58 +58,55 @@ def main(): # county resolution, files by state and county - county_resolution = { - 'geo_top_dir': 'by_state_and_county', - 'partition_cols': { - comstock.STATE_ABBRV: 'state', - comstock.COUNTY_ID: 'county', - }, - 'aggregation_levels': [comstock.COUNTY_ID], # , comstock.COUNTY_ID], # Full tract resolution (agg=in.nhgis_tract_gisjoin) - 'data_types': ['full'], - 'file_types': ['parquet'], - } + county_resolution = { + 'geo_top_dir': 'by_state_and_county', + 'partition_cols': { + comstock.STATE_ABBRV: 'state', + comstock.COUNTY_ID: 'county', + }, + 'aggregation_levels': [comstock.COUNTY_ID], + 'data_types': ['full'], + 'file_types': ['parquet'], + } # state level resolution, one single national file state_resolution = { - 'geo_top_dir': 'national_by_state', - 'partition_cols': {}, - 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], - 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. - 'file_types': ['parquet'], # other options:'parquet' + 'geo_top_dir': 'national_by_state', + 'partition_cols': {}, + 'aggregation_levels': [[comstock.STATE_ABBRV, comstock.CZ_ASHRAE]], + 'data_types': ['full'], # other options: 'detailed', 'basic' **If using multiple options, order must go from more detailed to less detailed. + 'file_types': ['parquet'], # other options:'parquet' } # specify the export level # IMPORTANT: if making county level timeseries plots, must export county level data to S3. This does not occur automatically. 
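# Hedged sketch (illustration only, not part of the patch): one way to honor the
# IMPORTANT note above automatically, by switching to county-level exports whenever
# any requested timeseries location looks like a county ID. This assumes county IDs
# are GISJOIN-style strings starting with 'G' (consistent with the nhgis county
# columns used elsewhere in these files) and state IDs are two-letter abbreviations;
# the helper name and example values are hypothetical.
def pick_geo_exports(timeseries_locations_to_plot, county_resolution, state_resolution):
    wants_county = any(str(loc).upper().startswith('G') for loc in timeseries_locations_to_plot)
    return [county_resolution] if wants_county else [state_resolution]

# e.g. {'MN': 'Minnesota'} -> state-level exports; a 'G...' county key -> county-level exports
assert pick_geo_exports({'MN': 'Minnesota'},
                        {'geo_top_dir': 'by_state_and_county'},
                        {'geo_top_dir': 'national_by_state'}) == [{'geo_top_dir': 'national_by_state'}]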
geo_exports = [county_resolution] #state_resolution - #for geo_export in geo_exports: - # # write files locally as needed - # #comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=0, geo_exports=[geo_export]) + for geo_export in geo_exports: + # write files locally as needed - usually not needed for AMI comparison plots + #comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=0, geo_exports=[geo_export]) - # # Also write to S3 if making timeseries plots - # if comstock.make_timeseries_plots: - # s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" - # s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) - # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=0, geo_exports=[geo_export], output_dir=s3_output_dir) + # Also write to S3 if making timeseries plots + s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) + comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=0, geo_exports=[geo_export], output_dir=s3_output_dir) # write select results to S3 for Athena/Glue when needed for timeseries plots - #s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" - #database = "enduse" - #dataset_name = f"{comstock.comstock_run_name}/{comstock.comstock_run_name}" # name of the dir in s3_dir we want to make new tables and views for - #crawler_name = comstock.comstock_run_name # used to set name of crawler, cannot include slashes - #workgroup = "eulp" # Athena workgroup to use - #glue_service_role = "service-role/AWSGlueServiceRole-default" + s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + database = "enduse" + dataset_name = f"{comstock.comstock_run_name}/{comstock.comstock_run_name}" # name of the dir in s3_dir we want to make new tables and views for + crawler_name = comstock.comstock_run_name # used to set name of crawler, cannot include slashes + workgroup = "eulp" # Athena workgroup to use + glue_service_role = "service-role/AWSGlueServiceRole-default" # Export parquet files to S3 for Athena/Glue - # TODO: modify so that county data can be used for timeseries plots - # TODO: Modify geo export structure to specify parquet files only for this part of the workflow - #comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", - # dataset_name=crawler_name, - # database_name=database, - # glue_service_role=glue_service_role) - #comstock.fix_timeseries_tables(crawler_name, database) - #comstock.create_views(crawler_name, database, workgroup) - + comstock.create_sightglass_tables( + s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", + dataset_name=crawler_name, + database_name=database, + glue_service_role=glue_service_role) + comstock.fix_timeseries_tables(crawler_name, database) + comstock.create_views(crawler_name, database, workgroup) # AMI ami = cspp.AMI( diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index 346786a17..77997de28 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -2,26 +2,7 @@ # -*- coding: utf-8 -*- import logging -import boto3 -import time -from fsspec.core import url_to_fs -from sqlalchemy.engine import create_engine 
-import sqlalchemy as sa import comstockpostproc as cspp -from sqlalchemy_views import CreateView -import re - - -#import sys, inspect, importlib, site, platform, shutil, subprocess, os - -#mods = ["pyathena","s3fs","fsspec","botocore","aiobotocore","boto3","pandas"] -#for m in mods: -# try: -# mod = importlib.import_module(m) -# print(f"{m:12} {getattr(mod,'__version__','?'):>10} @ {inspect.getfile(mod)}") -# except Exception as e: -# print(f"{m:12} NOT IMPORTABLE: {e}") - logging.basicConfig(level='INFO') # Use DEBUG, INFO, or WARNING logger = logging.getLogger(__name__) @@ -118,20 +99,19 @@ def main(): # specify the export level # IMPORTANT: if making county level timeseries plots, must export county level data to S3. This does not occur automatically. - geo_exports = [county_resolution] #state_resolution - - #for geo_export in geo_exports: - # for upgrade_id in comstock.upgrade_ids_to_process: - # if upgrade_id == 0: - # continue - # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) + geo_exports = [state_resolution] #county_resolution - # # Also write to S3 if making timeseries plots - # if comstock.make_timeseries_plots: TODO: force geo exports to county data if couunty timeseries is requested. - # s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" - # s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) - # comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) + for geo_export in geo_exports: + for upgrade_id in comstock.upgrade_ids_to_process: + if upgrade_id == 0: + continue + comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) + # Also write to S3 if making timeseries plots + if comstock.make_timeseries_plots: # TODO: force geo exports to county data if couunty timeseries is requested. 
+ s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" + s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) + comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) # write select results to S3 for Athena/Glue when needed for timeseries plots if comstock.make_timeseries_plots: @@ -143,14 +123,12 @@ def main(): glue_service_role = "service-role/AWSGlueServiceRole-default" # Export parquet files to S3 for Athena/Glue - # TODO: modify so that county data can be used for timeseries plots - # TODO: Modify geo export structure to specify parquet files only for this part of the workflow - #comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", - # dataset_name=crawler_name, - # database_name=database, - # glue_service_role=glue_service_role) - #comstock.fix_timeseries_tables(crawler_name, database) - #comstock.create_views(crawler_name, database, workgroup) + comstock.create_sightglass_tables(s3_location=f"{s3_dir}/metadata_and_annual_results_aggregates", + dataset_name=crawler_name, + database_name=database, + glue_service_role=glue_service_role) + comstock.fix_timeseries_tables(crawler_name, database) + comstock.create_views(crawler_name, database, workgroup) # Create measure run comparisons; only use if run has measures comparison = cspp.ComStockMeasureComparison(comstock, timeseries_locations_to_plot=comstock.timeseries_locations_to_plot, make_comparison_plots = comstock.make_comparison_plots, make_timeseries_plots = comstock.make_timeseries_plots) diff --git a/postprocessing/compare_upgrades.py.template b/postprocessing/compare_upgrades.py.template index 1580a60c4..43a7cd784 100644 --- a/postprocessing/compare_upgrades.py.template +++ b/postprocessing/compare_upgrades.py.template @@ -48,7 +48,7 @@ def main(): stock_estimate = cspp.Apportion( stock_estimation_version='2025R3', # Only updated when a new stock estimate is published truth_data_version='v01', # Typically don't change this - reload_from_cache=False, # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. + reload_from_cache=False, # Set to "True" if you have already run apportionment and would like to keep consistent values between postprocessing runs. #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' ) @@ -108,7 +108,7 @@ def main(): comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) # Also write to S3 if making timeseries plots - if comstock.make_timeseries_plots: #TODO: force geo exports to county data if couunty timeseries is requested. + if comstock.make_timeseries_plots: #TODO: force geo exports to county data if county timeseries is requested. 
s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" s3_output_dir = comstock.setup_fsspec_filesystem(s3_dir, aws_profile_name=None) comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id=upgrade_id, geo_exports=[geo_export], output_dir=s3_output_dir) diff --git a/postprocessing/comstockpostproc/comstock.py b/postprocessing/comstockpostproc/comstock.py index 23fd167df..e6faf59ab 100644 --- a/postprocessing/comstockpostproc/comstock.py +++ b/postprocessing/comstockpostproc/comstock.py @@ -11,7 +11,6 @@ from fsspec.core import url_to_fs from sqlalchemy.engine import create_engine from joblib import Parallel, delayed -from fsspec import register_implementation import sqlalchemy as sa import time from sqlalchemy_views import CreateView @@ -403,7 +402,6 @@ def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, self.ami_timeseries_data = pd.read_csv(file_path, low_memory=False, index_col='timestamp', parse_dates=True) else: - ########## REVISE TO USE NEW TIMESERIES FUNCTIONALITY athena_end_uses = list(map(lambda x: self.END_USES_TIMESERIES_DICT[x], self.END_USES)) athena_end_uses.append('total_site_electricity_kwh') all_timeseries_df = pd.DataFrame() @@ -412,9 +410,6 @@ def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, if os.path.isfile(region_file_path_long) and reload_from_csv and save_individual_regions: logger.info(f"timeseries data in long format for {region['source_name']} already exists at {region_file_path_long}") continue - - - # Use ComStockQueryBuilder builder = ComStockQueryBuilder(self.comstock_run_name) weight_view_table = f'{self.comstock_run_name}_md_agg_by_state_and_county_vu' query = builder.get_timeseries_aggregation_query( @@ -429,8 +424,6 @@ def download_timeseries_data_for_ami_comparison(self, ami, reload_from_csv=True, ts_agg = athena_client.execute(query) - ######################## END REVISED SECTION - if ts_agg['time'].dtype == 'Int64': # Convert bigint to timestamp type if necessary ts_agg['time'] = pd.to_datetime(ts_agg['time']/1e9, unit='s') @@ -581,7 +574,7 @@ def convert_timeseries_to_long(self, agg_df, county_ids, output_name, save_indiv on=['timestamp', 'building_type', 'sample_count', 'enduse'], how='inner' ).set_index('timestamp') - agg_df = agg_df.rename(columns={'sample_count': 'bldg_count'})\ + agg_df = agg_df.rename(columns={'sample_count': 'bldg_count'}) # save out long data format if save_individual_region: @@ -4161,7 +4154,7 @@ def create_sightglass_tables( }] try: - crawler_info = glue.get_crawler(Name=crawler_name) + _ = glue.get_crawler(Name=crawler_name) except glue.exceptions.EntityNotFoundException as ex: logger.info(f"Creating Crawler {crawler_name}") glue.create_crawler(**crawler_params) @@ -4300,7 +4293,7 @@ def fix_timeseries_tables(dataset_name: str, database_name: str = "vizstock"): else: logger.debug('Waiting 10 seconds to check index creation status') time.sleep(10) - if not index_deleted: + if not index_created: raise RuntimeError(f'Did not create index in 600 seconds, stopping.') @staticmethod @@ -4458,19 +4451,6 @@ def create_views( bys = bys.replace(f'agg_', '').replace(f'by_', '').replace(f'_parquet', '').split('_and_') logger.debug(f"Partition identifiers = {', '.join(bys)}") - # Rename the partition column to match hive partition - # Move it up in the column order for easier debugging - # TODO figure out a better approach than hard-coding - aliases = { - 'state': {'col': 'in.state','alias': 'state'}, - 'puma': {'col': 
'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'county': {'col': 'in.nhgis_county_gisjoin', 'alias': 'county'}, - 'puma_midwest': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'puma_northeast': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'puma_south': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'}, - 'puma_west': {'col': 'in.nhgis_puma_gisjoin', 'alias': 'puma'} - } - # Select columns for the metadata, aliasing partition columns cols = [] for col in metadata_tbl.columns: @@ -4535,9 +4515,7 @@ def create_views( create_view = CreateView(view, q, or_replace=True) logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") engine.execute(create_view) - # logger.debug('Columns in view:') - # for c in cols: - # logger.debug(c) + else: q = sa.select(cols) view_name = metadata_tblname.replace('_parquet', '') @@ -4549,9 +4527,6 @@ def create_views( create_view = CreateView(view, q, or_replace=True) logger.info(f"Creating metadata view: {view_name}, partitioned by {bys}") engine.execute(create_view) - # logger.debug('Columns in view:') - # for c in cols: - # logger.debug(c) # Get a list of timeseries tables get_ts_tables_kw = { @@ -4635,7 +4610,7 @@ def determine_state_or_county_timeseries_table(self, location_input, location_na return 'county' else: # Handle other potential formats - default to state - print(f"Warning: Unrecognized location ID format: {location_input}, defaulting to state") + logger.warning(f"Warning: Unrecognized location ID format: {location_input}, defaulting to state") return 'state' # Handle multiple locations (tuple or list) @@ -4647,7 +4622,7 @@ def determine_state_or_county_timeseries_table(self, location_input, location_na return 'state' # All are states else: - print(f"Warning: Unexpected location input type: {type(location_input)}, defaulting to state") + logger.warning(f"Warning: Unexpected location input type: {type(location_input)}, defaulting to state") return 'state' def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, upgrade_name): @@ -4747,15 +4722,14 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up applic_bldgs_list = list(applic_df[self.BLDG_ID].unique()) applic_bldgs_list = [int(x) for x in applic_bldgs_list] + # Create location description string for logging messages + location_desc = f"locations {location_list}" if len(location_list) > 1 else f"{primary_geo_type} {location_list[0]}" + # Check if any applicable buildings were found if len(applic_bldgs_list) == 0: - location_desc = f"locations {location_list}" if len(location_list) > 1 else f"{primary_geo_type} {location_list[0]}" print(f"Warning: No applicable buildings found for {location_desc} and upgrade(s) {upgrade_num}. 
Returning empty DataFrames.") return pd.DataFrame(), pd.DataFrame() - import time - from datetime import datetime, time - # Build geographic restrictions for all locations geo_restrictions = [] if state_locations: @@ -4763,6 +4737,10 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up if county_locations: geo_restrictions.append((self.COUNTY_ID, county_locations)) + # Initialize variables before loop to avoid NameError if loop doesn't execute + df_base_ts_agg_weighted = None + df_up_ts_agg_weighted = None + # loop through upgrades for upgrade_id in df[self.UPGRADE_ID].unique(): @@ -4784,7 +4762,6 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up weight_view_table=weight_view_table ) - location_desc = f"locations {location_list}" if len(location_list) > 1 else f"{primary_geo_type} {location_list[0]}" print(f"Getting weighted baseline load profile for {location_desc} and upgrade id {upgrade_id} with {len(applic_bldgs_list)} applicable buildings (using {agg} table).") # Execute query @@ -4793,8 +4770,7 @@ def get_weighted_load_profiles_from_s3(self, df, upgrade_num, location_input, up else: # baseline load data when no upgrades are present, or upgrade load data - - ## Create query builder and generate query for upgrade data + # create query builder and generate query for upgrade data builder = ComStockQueryBuilder(self.comstock_run_name) # Build restrictions list including all geographic locations (same as baseline) diff --git a/postprocessing/comstockpostproc/comstock_measure_comparison.py b/postprocessing/comstockpostproc/comstock_measure_comparison.py index 7c9d27212..6073dd427 100644 --- a/postprocessing/comstockpostproc/comstock_measure_comparison.py +++ b/postprocessing/comstockpostproc/comstock_measure_comparison.py @@ -190,8 +190,12 @@ def make_plots(self, lazy_frame: pl.LazyFrame, column_for_grouping, timeseries_l columns=(self.lazyframe_plotter.BASE_COLUMNS + self.lazyframe_plotter.UNMET_HOURS_COLS))(**BASIC_PARAMS) if make_timeseries_plots: - TIMESERIES_PARAMS = {'comstock_run_name': self.comstock_run_name, 'timeseries_locations_to_plot': timeseries_locations_to_plot, 'color_map': color_map, - 'output_dir': output_dir, 'comstock_obj': self.comstock_object} + TIMESERIES_PARAMS = { + 'comstock_run_name': self.comstock_run_name, + 'timeseries_locations_to_plot': timeseries_locations_to_plot, + 'color_map': color_map, + 'output_dir': output_dir, + 'comstock_obj': self.comstock_object} LazyFramePlotter.plot_with_lazy(plot_method=self.plot_measure_timeseries_peak_week_by_state, lazy_frame=lazy_frame.clone(), columns=(self.lazyframe_plotter.BASE_COLUMNS + [self.UPGRADE_ID, self.BLDG_TYPE]))(**TIMESERIES_PARAMS) #self.BLDG_WEIGHT, diff --git a/postprocessing/comstockpostproc/comstock_query_builder.py b/postprocessing/comstockpostproc/comstock_query_builder.py index f16b26374..c9bc0adaa 100644 --- a/postprocessing/comstockpostproc/comstock_query_builder.py +++ b/postprocessing/comstockpostproc/comstock_query_builder.py @@ -6,7 +6,7 @@ """ import logging -from typing import Dict, List, Optional, Union +from typing import List, Optional, Union logger = logging.getLogger(__name__) @@ -32,9 +32,9 @@ def __init__(self, athena_table_name: str): #TODO: this method has not yet been validated, and should be prior to use. 
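# Hedged, self-contained sketch (not the real ComStock API) of the contract the
# empty-applicability guard above creates: the load-profile helper now returns two
# empty DataFrames instead of raising, so downstream plotting code is expected to
# check .empty and skip that location/upgrade gracefully. All names are stand-ins.
import pandas as pd

def get_weighted_profiles_stub(applicable_building_ids):
    if len(applicable_building_ids) == 0:
        print("Warning: No applicable buildings found. Returning empty DataFrames.")
        return pd.DataFrame(), pd.DataFrame()
    # A real implementation would build and run the Athena queries here
    return pd.DataFrame({'kwh': [1.0]}), pd.DataFrame({'kwh': [0.9]})

df_base_ts, df_up_ts = get_weighted_profiles_stub([])
if df_base_ts.empty or df_up_ts.empty:
    print("Skipping timeseries plot: nothing to plot for this location/upgrade.")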
def get_monthly_energy_consumption_query(self, - upgrade_ids: Optional[List[Union[int, str]]] = None, - states: Optional[List[str]] = None, - building_types: Optional[List[str]] = None) -> str: + upgrade_ids: Optional[List[Union[int, str]]] = None, + states: Optional[List[str]] = None, + building_types: Optional[List[str]] = None) -> str: """ Build query for monthly natural gas and electricity consumption by state and building type. @@ -45,6 +45,7 @@ def get_monthly_energy_consumption_query(self, Returns: SQL query string + """ # Build WHERE clause conditions @@ -96,16 +97,17 @@ def get_monthly_energy_consumption_query(self, return query.strip() - def get_timeseries_aggregation_query(self, - upgrade_id: Union[int, str], - enduses: List[str], - weight_view_table: str, - group_by: Optional[List[str]] = None, - restrictions: List[tuple] = None, - timestamp_grouping: str = 'hour', - building_ids: Optional[List[int]] = None, - include_sample_stats: bool = True, - include_area_normalized_cols: bool = False) -> str: + def get_timeseries_aggregation_query( + self, + upgrade_id: Union[int, str], + enduses: List[str], + weight_view_table: str, + group_by: Optional[List[str]] = None, + restrictions: List[tuple] = None, + timestamp_grouping: str = 'hour', + building_ids: Optional[List[int]] = None, + include_sample_stats: bool = True, + include_area_normalized_cols: bool = False) -> str: """ Build query for timeseries data aggregation similar to buildstock query functionality. @@ -125,6 +127,7 @@ def get_timeseries_aggregation_query(self, Returns: SQL query string + """ # Weight view table is required - cannot auto-assign without knowing state vs county aggregation @@ -133,18 +136,15 @@ def get_timeseries_aggregation_query(self, "Please provide the full weight view table name (e.g., 'your_dataset_md_agg_national_by_state_vu' or 'your_dataset_md_agg_national_by_county_vu')") # Build timestamp grouping with date_trunc and time adjustment + time_group = '1' if timestamp_grouping == 'hour': time_select = f"date_trunc('hour', date_add('second', -900, {self.timeseries_table}.time)) AS time" - time_group = '1' elif timestamp_grouping == 'day': time_select = f"date_trunc('day', {self.timeseries_table}.time) AS time" - time_group = '1' elif timestamp_grouping == 'month': time_select = f"date_trunc('month', {self.timeseries_table}.time) AS time" - time_group = '1' else: time_select = f"{self.timeseries_table}.time AS time" - time_group = '1' # Build SELECT clause with sample statistics select_clauses = [time_select] @@ -158,7 +158,7 @@ def get_timeseries_aggregation_query(self, if include_sample_stats: select_clauses.extend([ f"count(distinct({self.timeseries_table}.building_id)) AS sample_count", - f"(count(distinct({self.timeseries_table}.building_id)) * sum(1 * {weight_view_table}.weight)) / sum(1) AS units_count", + f"(count(distinct({self.timeseries_table}.building_id)) * sum({weight_view_table}.weight)) / sum(1) AS units_count", f"sum(1) / count(distinct({self.timeseries_table}.building_id)) AS rows_per_sample" ]) @@ -169,14 +169,28 @@ def get_timeseries_aggregation_query(self, # Build weighted enduse aggregations for enduse in enduses: - weighted_sum = f"sum({self.timeseries_table}.{enduse} * 1 * {weight_view_table}.weight) AS {enduse}" + weighted_sum = f"sum({self.timeseries_table}.{enduse} * {weight_view_table}.weight) AS {enduse}" select_clauses.append(weighted_sum) # Add kwh_per_sf columns if requested (for AMI comparison - normalized by weighted area) - # Note: When aggregating 15-minute data to 
hourly, area gets summed multiple times - # Divide by rows_per_sample to get the correct area normalization + # Note: Building area is per building, but appears in multiple timestep rows when joined. + # The sum() counts each building's area once per timestep. Divide by rows_per_sample to correct. + # + # Example: 4 buildings (each 10k sqft), 15-min data aggregated to hourly (4 timesteps): + # - Each building contributes 4 rows (one per 15-min interval) + # - weighted_energy_sum: 100 kWh (energy varies per timestep, summed across all 16 rows) + # - weighted_area_sum: 160k sqft (same 10k per building repeated 4 times: 4 buildings × 10k × 4 timesteps) + # - rows_per_sample: 16 total rows / 4 distinct buildings = 4 timesteps/building + # - Corrected area: 160k / 4 = 40k sqft (removes timestep duplication) + # - Result: 100 kWh / 40k sqft = 0.0025 kWh/sqft if include_area_normalized_cols and enduse.endswith('_kwh'): - kwh_per_sf = f"sum({self.timeseries_table}.{enduse} * 1 * {weight_view_table}.weight) / (sum({weight_view_table}.\"in.sqft\" * {weight_view_table}.weight) / (sum(1) / count(distinct({self.timeseries_table}.building_id)))) AS {enduse.replace('_kwh', '_kwh_per_sf')}" + # Build the formula in parts for clarity + weighted_energy_sum = f"sum({self.timeseries_table}.{enduse} * {weight_view_table}.weight)" + weighted_area_sum = f"sum({weight_view_table}.\"in.sqft\" * {weight_view_table}.weight)" + rows_per_sample = f"(sum(1) / count(distinct({self.timeseries_table}.building_id)))" + corrected_area = f"({weighted_area_sum} / {rows_per_sample})" + + kwh_per_sf = f"{weighted_energy_sum} / {corrected_area} AS {enduse.replace('_kwh', '_kwh_per_sf')}" select_clauses.append(kwh_per_sf) select_clause = ',\n '.join(select_clauses) @@ -241,11 +255,11 @@ def get_timeseries_aggregation_query(self, return query def get_applicability_query(self, - upgrade_ids: List[Union[int, str]], - state: Optional[Union[str, List[str]]] = None, - county: Optional[Union[str, List[str]]] = None, - columns: Optional[List[str]] = None, - weight_view_table: Optional[str] = None) -> str: + upgrade_ids: List[Union[int, str]], + state: Optional[Union[str, List[str]]] = None, + county: Optional[Union[str, List[str]]] = None, + columns: Optional[List[str]] = None, + weight_view_table: Optional[str] = None) -> str: """ Build query to get applicable buildings and their characteristics from the weight view. 
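# Worked check of the area-normalization example in the comment above (hedged
# illustration using the same hypothetical numbers; unit weights assumed).
n_buildings = 4
sqft_per_building = 10_000
timesteps_per_building = 4                                   # four 15-min rows per building per hour
total_rows = n_buildings * timesteps_per_building            # 16 joined rows

weighted_energy_sum = 100.0                                  # kWh summed over all 16 rows
weighted_area_sum = n_buildings * sqft_per_building * timesteps_per_building  # 160,000 sqft
rows_per_sample = total_rows / n_buildings                   # 16 / 4 = 4
corrected_area = weighted_area_sum / rows_per_sample         # 160,000 / 4 = 40,000 sqft

kwh_per_sf = weighted_energy_sum / corrected_area            # 100 / 40,000
assert abs(kwh_per_sf - 0.0025) < 1e-12                      # matches the docstring example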
@@ -258,6 +272,7 @@ def get_applicability_query(self, Returns: SQL query string + """ # If no weight view table provided, construct name based on geographic level diff --git a/postprocessing/comstockpostproc/comstock_to_ami_comparison.py b/postprocessing/comstockpostproc/comstock_to_ami_comparison.py index 2fecaf0ab..35bcfdebe 100644 --- a/postprocessing/comstockpostproc/comstock_to_ami_comparison.py +++ b/postprocessing/comstockpostproc/comstock_to_ami_comparison.py @@ -35,14 +35,14 @@ def __init__(self, comstock_object:ComStock, ami_object:AMI, image_type='jpg', n logger.warning(f'No timeseries data was available for {dataset.dataset_name}, unable to make AMI comparison.') continue if isinstance(dataset, ComStock): - df = dataset.ami_timeseries_data.copy() + df = dataset.ami_timeseries_data.copy(deep=True) df['run'] = self.comstock_object.dataset_name df['year'] = self.comstock_object.year # Normalize building type names to match AMI format df['building_type'] = df['building_type'].map(self.BLDG_TYPE_TO_SNAKE_CASE).fillna(df['building_type'].str.lower()) dfs_to_concat.append(df) elif isinstance(dataset, AMI): - df = dataset.ami_timeseries_data.copy() + df = dataset.ami_timeseries_data.copy(deep=True) df['run'] = self.ami_object.dataset_name df['enduse'] = 'total' dfs_to_concat.append(df) diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 0c4deb8cf..86e94d412 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -11,28 +11,9 @@ import plotly.express as px import seaborn as sns import plotly.graph_objects as go -import sys -from buildstock_query import BuildStockQuery import matplotlib.colors as mcolors from plotly.subplots import make_subplots import comstockpostproc.comstock as comstock -from comstockpostproc.comstock_query_builder import ComStockQueryBuilder -import fsspec -import s3fs -from pyathena.arrow.cursor import ArrowCursor - -# --- put this at the VERY TOP of your main script --- -from pyathena import connect as _orig_connect -from pyathena.arrow.cursor import ArrowCursor -import buildstock_query.query_core as qc # this is what QueryCore calls - -def _connect_with_arrow(*args, **kwargs): - kwargs.setdefault("cursor_class", ArrowCursor) - kwargs.setdefault("cursor_kwargs", {"unload": True}) # downloads via boto3, no s3fs/fsspec involved - return _orig_connect(*args, **kwargs) - -qc.connect = _connect_with_arrow # patch the exact symbol QueryCore uses -# --- end patch --- matplotlib.use('Agg') logger = logging.getLogger(__name__) @@ -3056,7 +3037,7 @@ def map_to_season(month): def plot_measure_timeseries_peak_week_by_state(self, df, output_dir, timeseries_locations_to_plot, color_map, comstock_run_name, comstock_obj=None): #, df, region, building_type, color_map, output_dir # get upgrade ID - df_data = df.copy() + df_data = df.copy(deep=True) # coerce data type of upgrade ID - arrives as float, need str of int for querying df_data[self.UPGRADE_ID] = pd.to_numeric(df_data[self.UPGRADE_ID], errors="coerce").astype("Int64").astype(str) df_upgrade = df_data.loc[(df_data[self.UPGRADE_ID]!="0"), :] diff --git a/postprocessing/setup.py b/postprocessing/setup.py index 7cd302bf1..4b9dba852 100644 --- a/postprocessing/setup.py +++ b/postprocessing/setup.py @@ -42,7 +42,7 @@ 'Programming Language :: Python :: 3.10', ], keywords='comstock postprocessing', - python_requires=">=3.9", + python_requires=">=3.10.12", install_requires=[ 'boto3', 'botocore', From 
043aec354d167805ea3560cbc6d10d49572396c2 Mon Sep 17 00:00:00 2001 From: christophercaradonna Date: Tue, 30 Dec 2025 12:51:30 -0700 Subject: [PATCH 16/16] Various updates from PR comments --- .../compare_comstock_to_ami.py.template | 1 - .../compare_upgrades-test_timeseries_plots.py | 25 +++++++++-------- .../comstock_query_builder.py | 27 +++++++++++++++++++ .../comstockpostproc/plotting_mixin.py | 2 -- postprocessing/setup.py | 4 +-- 5 files changed, 41 insertions(+), 18 deletions(-) diff --git a/postprocessing/compare_comstock_to_ami.py.template b/postprocessing/compare_comstock_to_ami.py.template index 2f2e8a2aa..00cea4e02 100644 --- a/postprocessing/compare_comstock_to_ami.py.template +++ b/postprocessing/compare_comstock_to_ami.py.template @@ -94,7 +94,6 @@ def main(): # write select results to S3 for Athena/Glue when needed for timeseries plots s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" database = "enduse" - dataset_name = f"{comstock.comstock_run_name}/{comstock.comstock_run_name}" # name of the dir in s3_dir we want to make new tables and views for crawler_name = comstock.comstock_run_name # used to set name of crawler, cannot include slashes workgroup = "eulp" # Athena workgroup to use glue_service_role = "service-role/AWSGlueServiceRole-default" diff --git a/postprocessing/compare_upgrades-test_timeseries_plots.py b/postprocessing/compare_upgrades-test_timeseries_plots.py index 77997de28..078dc529b 100644 --- a/postprocessing/compare_upgrades-test_timeseries_plots.py +++ b/postprocessing/compare_upgrades-test_timeseries_plots.py @@ -11,8 +11,8 @@ def main(): # ComStock run comstock = cspp.ComStock( s3_base_dir='com-sdr', # If run not on S3, download results_up**.parquet manually - comstock_run_name='rtuadv_v11', # Name of the run on S3 - comstock_run_version='rtuadv_v11', # Use whatever you want to see in plot and folder names + comstock_run_name='pump_v9', # Name of the run on S3 + comstock_run_version='pump_v9', # Use whatever you want to see in plot and folder names comstock_year=2018, # Typically don't change this athena_table_name=None, # Typically don't change this truth_data_version='v01', # Typically don't change this @@ -21,10 +21,10 @@ def main(): drop_failed_runs=True, # False if you want to evaluate which runs failed in raw output data color_hex='#0072B2', # Color used to represent this run in plots skip_missing_columns=True, # False if you want to ensure you have all data specified for export - reload_from_cache=True, # True if CSV already made and want faster reload times + reload_from_cache=False, # True if CSV already made and want faster reload times include_upgrades=True, # False if not looking at upgrades - upgrade_ids_to_skip=[2,3], # Use [1, 3] etc. to exclude certain upgrades - make_timeseries_plots=True, + upgrade_ids_to_skip=[], # Use [1, 3] etc. to exclude certain upgrades + make_timeseries_plots=False, timeseries_locations_to_plot={ 'MN': 'Minnesota', # specify location (either county ID or state ID) and corresponding name for plots and folders. #'MA':'Massachusetts', @@ -47,20 +47,20 @@ def main(): stock_estimate = cspp.Apportion( stock_estimation_version='2025R3', # Only updated when a new stock estimate is published truth_data_version='v01', # Typically don't change this - reload_from_cache=True, # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. 
+ reload_from_cache=False, # Set to "True" if you have already run apportionment and would like to keep consistant values between postprocessing runs. #output_dir = 's3://oedi-data-lake/nrel-pds-building-stock/end-use-load-profiles-for-us-building-stock/2025/comstock_amy2018_release_1' ) # Scale ComStock runs to the 'truth data' from StockE V3 estimates using bucket-based apportionment base_sim_outs = comstock.get_sim_outs_for_upgrade(0) - comstock.create_allocated_weights(stock_estimate, base_sim_outs, reload_from_cache=True) + comstock.create_allocated_weights(stock_estimate, base_sim_outs, reload_from_cache=False) # CBECS cbecs = cspp.CBECS( cbecs_year=2018, # 2012 and 2018 currently available truth_data_version='v01', # Typically don't change this color_hex='#009E73', # Color used to represent CBECS in plots - reload_from_csv=True # True if CSV already made and want faster reload times + reload_from_csv=False # True if CSV already made and want faster reload times ) # Scale ComStock to CBECS 2018 AND remove non-ComStock buildings from CBECS @@ -99,13 +99,13 @@ def main(): # specify the export level # IMPORTANT: if making county level timeseries plots, must export county level data to S3. This does not occur automatically. - geo_exports = [state_resolution] #county_resolution + geo_exports = [county_resolution] #county_resolution for geo_export in geo_exports: for upgrade_id in comstock.upgrade_ids_to_process: - if upgrade_id == 0: - continue - comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) + #if upgrade_id == 0: + # continue + #comstock.export_metadata_and_annual_results_for_upgrade(upgrade_id, [geo_export]) # Also write to S3 if making timeseries plots if comstock.make_timeseries_plots: # TODO: force geo exports to county data if couunty timeseries is requested. @@ -117,7 +117,6 @@ def main(): if comstock.make_timeseries_plots: s3_dir = f"s3://{comstock.s3_base_dir}/{comstock.comstock_run_name}/{comstock.comstock_run_name}" database = "enduse" - dataset_name = f"{comstock.comstock_run_name}/{comstock.comstock_run_name}" # name of the dir in s3_dir we want to make new tables and views for crawler_name = comstock.comstock_run_name # used to set name of crawler, cannot include slashes workgroup = "eulp" # Athena workgroup to use glue_service_role = "service-role/AWSGlueServiceRole-default" diff --git a/postprocessing/comstockpostproc/comstock_query_builder.py b/postprocessing/comstockpostproc/comstock_query_builder.py index c9bc0adaa..33ca48ece 100644 --- a/postprocessing/comstockpostproc/comstock_query_builder.py +++ b/postprocessing/comstockpostproc/comstock_query_builder.py @@ -114,6 +114,12 @@ def get_timeseries_aggregation_query( This matches the pattern from working buildstock queries that join timeseries data with a weight view table for proper weighting and aggregation. + Automatically detects if the timeseries table is partitioned by state based on naming conventions: + - Tables with 'ts_by_state' in the name are treated as state-partitioned + - Tables with '_timeseries' in the name (without 'ts_by_state') are treated as non-partitioned + - Other naming patterns assume no partitioning with a warning to the user regarding this assumption. + TODO: Improve this in the future to check for partitioning directly from Athena metadata if possible. + Args: upgrade_id: Upgrade ID to filter on enduses: List of end use columns to select @@ -135,6 +141,21 @@ def get_timeseries_aggregation_query( raise ValueError("weight_view_table parameter is required. 
Cannot auto-assign without knowing geographic aggregation level (state vs county). " "Please provide the full weight view table name (e.g., 'your_dataset_md_agg_national_by_state_vu' or 'your_dataset_md_agg_national_by_county_vu')") + # Auto-detect if timeseries is partitioned by state based on naming conventions + table_name_lower = self.timeseries_table.lower() + if 'ts_by_state' in table_name_lower: + timeseries_partitioned_by_state = True + logger.info(f"Detected state-partitioned timeseries table: {self.timeseries_table}. " + "Query will include state partition filter so timeseries files for a building ID are not double counted.") + elif '_timeseries' in table_name_lower: + timeseries_partitioned_by_state = False + logger.info(f"Detected standard non-partitioned timeseries table: {self.timeseries_table}") + else: + timeseries_partitioned_by_state = False + logger.warning(f"Timeseries table name '{self.timeseries_table}' does not match expected patterns ('ts_by_state' or '_timeseries'). " + "Assuming no state partitioning. If the table is actually partitioned by state, building IDs may be double-counted. " + "Please use standard naming conventions: 'dataset_ts_by_state' for partitioned or 'dataset_timeseries' for non-partitioned tables.") + # Build timestamp grouping with date_trunc and time adjustment time_group = '1' if timestamp_grouping == 'hour': @@ -202,6 +223,12 @@ def get_timeseries_aggregation_query( f'{weight_view_table}.upgrade = {upgrade_id}' ] + # Add state partition filter if timeseries is partitioned by state (enables partition pruning) + # When partitioned by state, each building's timeseries data is duplicated across each state + # it is apportioned to. To avoid double counting, we filter to only the state matching the weight view. 
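# Hedged toy example (schematic pandas join, not the actual Athena tables): why the
# state filter added below matters for state-partitioned timeseries tables. A
# building's hourly rows appear once per state partition it is apportioned to, so a
# join on building ID alone counts its load once per partition; requiring the weight
# view's state to match the timeseries partition removes the duplication.
import pandas as pd

ts = pd.DataFrame({                      # one building duplicated under two state partitions
    'building_id': [1, 1],
    'state': ['CO', 'MN'],
    'electricity_kwh': [10.0, 10.0],
})
weights = pd.DataFrame({'bldg_id': [1], 'state': ['CO'], 'weight': [100.0]})

joined = ts.merge(weights, left_on='building_id', right_on='bldg_id', suffixes=('_ts', '_wt'))
unfiltered_total = (joined['electricity_kwh'] * joined['weight']).sum()   # 2000.0, double-counted
matched = joined[joined['state_ts'] == joined['state_wt']]
filtered_total = (matched['electricity_kwh'] * matched['weight']).sum()   # 1000.0, counted once
assert unfiltered_total == 2 * filtered_total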
+ if timeseries_partitioned_by_state: + where_conditions.append(f'{weight_view_table}.state = {self.timeseries_table}.state') + if building_ids: bldg_list = ', '.join([str(bid) for bid in building_ids]) where_conditions.append(f'{weight_view_table}."bldg_id" IN ({bldg_list})') diff --git a/postprocessing/comstockpostproc/plotting_mixin.py b/postprocessing/comstockpostproc/plotting_mixin.py index 2db476796..10e7a7bff 100644 --- a/postprocessing/comstockpostproc/plotting_mixin.py +++ b/postprocessing/comstockpostproc/plotting_mixin.py @@ -13,7 +13,6 @@ import plotly.graph_objects as go import matplotlib.colors as mcolors from plotly.subplots import make_subplots -import comstockpostproc.comstock as comstock matplotlib.use('Agg') logger = logging.getLogger(__name__) @@ -3476,7 +3475,6 @@ def map_to_dow(dow): dfs_merged['Season'] = dfs_merged['Month'].apply(map_to_season) dfs_merged_gb = dfs_merged.groupby(['in.upgrade_name', 'Season', 'Hour_of_Day'], observed=True)[dfs_merged.loc[:, dfs_merged.columns.str.contains('_kwh')].columns].mean().reset_index() - max_peak = dfs_merged_gb.loc[:, 'total_site_electricity_kwh'].max() # rename columns, convert units dfs_merged_gb.columns = dfs_merged_gb.columns.str.replace("electricity_", "") diff --git a/postprocessing/setup.py b/postprocessing/setup.py index 4b9dba852..aab6fd4b9 100644 --- a/postprocessing/setup.py +++ b/postprocessing/setup.py @@ -42,7 +42,7 @@ 'Programming Language :: Python :: 3.10', ], keywords='comstock postprocessing', - python_requires=">=3.10.12", + python_requires="==3.12.12", install_requires=[ 'boto3', 'botocore', @@ -59,7 +59,7 @@ 'scipy', 'seaborn>=0.12.0', 'xlrd', - 'buildstock_query @ git+https://github.com/NREL/buildstock-query@8f65e034' + 'buildstock_query @ git+https://github.com/NREL/buildstock-query@0479759' ], extras_require={ 'dev': [