From 646a5c3cdd18a34e76023f2375b5bb15c4161b93 Mon Sep 17 00:00:00 2001
From: aporr
Date: Fri, 23 Jan 2026 15:27:22 -0500
Subject: [PATCH 1/5] In morpc.CONST_REGIONS, rename entry REGIONMSA to CBSA
 so that morpc.SUMLEVEL_LOOKUP works correctly. Cascading changes to
 morpc.CONST_COLUMBUS_MSA_ID, morpc.CONST_REGIONS_GEOID, and
 morpc.CONST_REGIONS_COUNTYBASED

---
 morpc/morpc.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/morpc/morpc.py b/morpc/morpc.py
index 1c6cae3..2b33d81 100644
--- a/morpc/morpc.py
+++ b/morpc/morpc.py
@@ -24,7 +24,7 @@
 
 # Commonly used geographic identifiers
 # The following are assigned by the U.S. Census Bureau
-CONST_COLUMBUS_MSA_ID = '18140'
+CONST_COLUMBUS_CBSA_ID = '18140'
 CONST_OHIO_STATE_ID = '39'
 CONST_OHIO_REGION_ID = '2'      # Midwest
 CONST_OHIO_DIVISION_ID = '3'    # East North Central
@@ -124,7 +124,7 @@ def get_state_ids():
 CONST_REGIONS["OneColumbus Region"] = CONST_REGIONS["REGIONONECBUS"]
 CONST_REGIONS["REGIONCEDS"] = CONST_REGIONS["REGION10"] + ["Logan"]
 CONST_REGIONS["CEDS Region"] = CONST_REGIONS["REGIONCEDS"]
-CONST_REGIONS["REGIONMSA"] = CONST_REGIONS["REGION7"] + ["Hocking","Morrow","Perry"]
+CONST_REGIONS["CBSA"] = CONST_REGIONS["REGION7"] + ["Hocking","Morrow","Perry"]
 
 # Region identifiers
 # Note that the Columbus MSA already has a GEOID that is defined by the Census Bureau. See CONST_COLUMBUS_MSA_ID above.
@@ -138,11 +138,11 @@ def get_state_ids():
 CONST_REGIONS_GEOID["REGIONONECBUS"] = "001"
 CONST_REGIONS_GEOID["REGIONMPO"] = "001"
 CONST_REGIONS_GEOID["REGIONTDM"] = "001"
-CONST_REGIONS_GEOID["REGIONMSA"] = CONST_COLUMBUS_MSA_ID
+CONST_REGIONS_GEOID["CBSA"] = CONST_COLUMBUS_MSA_ID
 
 # The following regions are comprised of collections of whole counties. Not all region definitions are county-based,
 # for example the MPO region.
-CONST_REGIONS_COUNTYBASED = ["REGION15","REGION10","REGION7","REGIONCEDS","REGIONCORPO","REGIONONECBUS","REGIONMSA"]
+CONST_REGIONS_COUNTYBASED = ["REGION15","REGION10","REGION7","REGIONCEDS","REGIONCORPO","REGIONONECBUS","CBSA"]
 
 # County name abbreviations
 ## CONST_COUNTY_ABBREV maps the full county name to its three-letter abbreviation
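For reference, the renamed keys are used as in the minimal sketch below. Note that this sketch assumes the leftover references to CONST_COLUMBUS_MSA_ID have already been converted (that happens in PATCH 4/5); until then CONST_REGIONS_GEOID["CBSA"] still points at the old constant name.

import morpc

# County-based definition of the Columbus CBSA: the 7-county region plus
# Hocking, Morrow, and Perry (per the definition above)
print(morpc.CONST_REGIONS["CBSA"])

# Census-assigned GEOID for the Columbus CBSA
print(morpc.CONST_REGIONS_GEOID["CBSA"])    # '18140'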
""" import logging import sys + import os + import ipykernel + import json + + fileSpecified = True + if(filename is None): + fileSpecified = False + connectionInfo = json.loads(ipykernel.get_connection_info()) + scriptFilename = os.path.basename(os.path.normpath(connectionInfo["jupyter_session"])) + filename = f"{scriptFilename}.log" + logging.basicConfig( level=LEVEL_MAP[level], force=True, @@ -36,4 +47,7 @@ def config_logs(filename, level, mode = 'w'): ) logging.getLogger(__name__).setLevel(LEVEL_MAP[level]) - logger.info(f'Set up logging save to file {filename}') \ No newline at end of file + if(fileSpecified == False): + logger.info(f'Log filename not specified. Using default filename.') + + logger.info(f'Set up logging save to file "{filename}", log level "{level}"') From 913940eb23efc466397ada11cdb5b5406de5f3fb Mon Sep 17 00:00:00 2001 From: aporr Date: Fri, 23 Jan 2026 15:32:11 -0500 Subject: [PATCH 3/5] Add optional second output to morpc.reapportion_by_area that provides features and attributes for the intersection geographies --- morpc/morpc.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/morpc/morpc.py b/morpc/morpc.py index 2b33d81..580c6e8 100644 --- a/morpc/morpc.py +++ b/morpc/morpc.py @@ -2807,7 +2807,7 @@ def write_table(df, path, format=None, index=None): print("morpc.write_table | ERROR | This function does not currently support format {}. Add export arguments for this format in morpc.PANDAS_EXPORT_ARGS_OVERRIDE or use the native pandas export functions.".format(format)) raise RuntimeError -def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryType="sum", roundPreserveSum=None, partialCoverageStrategy="error", zeroCoverageStrategy="error", sourceShareTolerance=6, targetShareTolerance=6): +def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryType="sum", roundPreserveSum=None, partialCoverageStrategy="error", zeroCoverageStrategy="error", sourceShareTolerance=6, targetShareTolerance=6, returnIntersectData=False): """ Given you have some variable(s) summarized at one geography level, reapportion those variables to other geographies in proportion # to the area of overlap of the target geographies with the source geographies. This is accomplished by intersecting the target @@ -2855,11 +2855,17 @@ def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryTy geographies do not sum to 1. Round the sums to the specified decimal place prior to evaluation. Sum greater than 1 may indicate that there are overlapping polygons in the target geos or source geos. Sum less than 1 may indicate that portions of the target geos do not overlap the source geos. Set to None to allow no tolerance (warning will be generated if shares do not sum to exactly 1). + returnIntersectData : bool + Optional. If False, return only one output consisting of the reapportioned data (default). If true, return a second output (GeoDataFrame) + consisting of the intersection geometries and their attributes. Returns ------- targetGeosUpdated : geopandas.geodataframe.GeoDataFrame with polygon geometry type An updated version of targetGeos that includes the reapportioned variables. + intersectGeosUpdate : geopandas.geodataframe.GeoDataFrame with polygon geometry type + If requested. The results of the intersection of the source geos and target geos, including + areas and shares for both, plus geometry and area for the intersection polygon. 
""" import pandas as pd @@ -2922,6 +2928,11 @@ def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryTy # Compute the share of the target geo that each intersection polygon represents intersectGeos["TARGET_SHARE"] = intersectGeos["INTERSECT_GEOS_AREA"] / intersectGeos["TARGET_GEOS_AREA"] + # Create a copy of the intersection data that we can output if the user requested it. Put the columns in a sensible order. + intersectGeosOutput = intersectGeos.copy() + intersectGeosOutput = intersectGeosOutput.filter(items=["sourceIndex","targetIndex","SOURCE_GEOS_AREA","TARGET_GEOS_AREA","INTERSECT_GEOS_AREA", + "SOURCE_SHARE","TARGET_SHARE","geometry"], axis="columns") + # Make a list of the source geo IDs that appeared in the original source data but do not appear in the intersection data. # These are source geos that had zero overlap with the target geos. If there are entries in the list, throw an error if appropriate. if(summaryType == "sum"): @@ -3043,8 +3054,11 @@ def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryTy # Reorder the target geos columns as they were originally and append the reapportioned variables # to the end. targetGeosUpdated = targetGeosUpdated.filter(items=list(targetGeos.columns)+apportionColumns, axis="columns") - - return targetGeosUpdated + + if(returnIntersectData == True): + return (targetGeosUpdated, intersectGeosOutput) + else: + return targetGeosUpdated def hist_scaled(series, logy="auto", yRatioThreshold=100, xClassify=False, xRatioThreshold=100, scheme="NaturalBreaks", bins=10, retBinsCounts=False, figsize=None): """ From 774c42ffd0b1b0483681fc2d0b2ca195f30ab47e Mon Sep 17 00:00:00 2001 From: aporr Date: Mon, 26 Jan 2026 12:12:05 -0500 Subject: [PATCH 4/5] Convert a few references from MSA to CBSA that I missed previously --- morpc/census/geos.py | 8 ++++---- morpc/morpc.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/morpc/census/geos.py b/morpc/census/geos.py index 31856b6..97523a2 100644 --- a/morpc/census/geos.py +++ b/morpc/census/geos.py @@ -57,9 +57,9 @@ "for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['CEDS Region']])}" } }, - {"regionmsa": { + {"regioncbsa": { "in": "state:39", - "for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['REGIONMSA']])}" + "for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['CBSA']])}" } } ] @@ -68,8 +68,8 @@ "us": { 'for': 'us:1' }, - "columbusmsa": { - "for": f"metropolitan statistical area/micropolitan statistical area:{morpc.CONST_COLUMBUS_MSA_ID}" + "columbuscbsa": { + "for": f"metropolitan statistical area/micropolitan statistical area:{morpc.CONST_COLUMBUS_CBSA_ID}" } } diff --git a/morpc/morpc.py b/morpc/morpc.py index 580c6e8..5328b75 100644 --- a/morpc/morpc.py +++ b/morpc/morpc.py @@ -138,7 +138,7 @@ def get_state_ids(): CONST_REGIONS_GEOID["REGIONONECBUS"] = "001" CONST_REGIONS_GEOID["REGIONMPO"] = "001" CONST_REGIONS_GEOID["REGIONTDM"] = "001" -CONST_REGIONS_GEOID["CBSA"] = CONST_COLUMBUS_MSA_ID +CONST_REGIONS_GEOID["CBSA"] = CONST_COLUMBUS_CBSA_ID # The following regions are comprised of collections of whole counties. Not all region definitions are county-based, # for example the MPO region. 
From 774c42ffd0b1b0483681fc2d0b2ca195f30ab47e Mon Sep 17 00:00:00 2001
From: aporr
Date: Mon, 26 Jan 2026 12:12:05 -0500
Subject: [PATCH 4/5] Convert a few references from MSA to CBSA that I missed
 previously

---
 morpc/census/geos.py | 8 ++++----
 morpc/morpc.py       | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/morpc/census/geos.py b/morpc/census/geos.py
index 31856b6..97523a2 100644
--- a/morpc/census/geos.py
+++ b/morpc/census/geos.py
@@ -57,9 +57,9 @@
             "for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['CEDS Region']])}"
         }
     },
-    {"regionmsa": {
+    {"regioncbsa": {
         "in": "state:39",
-        "for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['REGIONMSA']])}"
+        "for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['CBSA']])}"
         }
     }
 ]
@@ -68,8 +68,8 @@
     "us": {
         'for': 'us:1'
     },
-    "columbusmsa": {
-        "for": f"metropolitan statistical area/micropolitan statistical area:{morpc.CONST_COLUMBUS_MSA_ID}"
+    "columbuscbsa": {
+        "for": f"metropolitan statistical area/micropolitan statistical area:{morpc.CONST_COLUMBUS_CBSA_ID}"
     }
 }
 
diff --git a/morpc/morpc.py b/morpc/morpc.py
index 580c6e8..5328b75 100644
--- a/morpc/morpc.py
+++ b/morpc/morpc.py
@@ -138,7 +138,7 @@ def get_state_ids():
 CONST_REGIONS_GEOID["REGIONONECBUS"] = "001"
 CONST_REGIONS_GEOID["REGIONMPO"] = "001"
 CONST_REGIONS_GEOID["REGIONTDM"] = "001"
-CONST_REGIONS_GEOID["CBSA"] = CONST_COLUMBUS_MSA_ID
+CONST_REGIONS_GEOID["CBSA"] = CONST_COLUMBUS_CBSA_ID
 
 # The following regions are comprised of collections of whole counties. Not all region definitions are county-based,
 # for example the MPO region.

From ddeeec1faf6ae29ca6acc7e02c18695be5ada3fa Mon Sep 17 00:00:00 2001
From: aporr
Date: Tue, 27 Jan 2026 16:42:00 -0500
Subject: [PATCH 5/5] Add morpc.updateExistingTable()

---
 morpc/morpc.py | 134 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 133 insertions(+), 1 deletion(-)

diff --git a/morpc/morpc.py b/morpc/morpc.py
index 5328b75..e14e19f 100644
--- a/morpc/morpc.py
+++ b/morpc/morpc.py
@@ -3354,4 +3354,136 @@ def ages_in_year(self, years, generations=None):
                 youngest_age = max(youngest_age, 0)
                 agesInYear[generation][year] = list(range(youngest_age, oldest_age+1))
 
-        return agesInYear
\ No newline at end of file
+        return agesInYear
+
+def updateExistingTable(newData, schema, existingData=None, sortColumns=None, overwrite=False):
+    """
+    This function takes a data table in a well-defined form (as captured in a Frictionless schema) and attempts to update
+    an existing table in the same form if it exists, or otherwise returns the new data as-is. If records with identical primary
+    key values exist in both tables, the existing data will be overwritten by the new data for those records (unless overwrite is
+    set to False). Records in the new data which are not already present in the existing data will be appended. The resulting
+    dataframe will be transformed to be compliant with the provided schema and, optionally, will be sorted by a user-specified
+    set of columns.
+
+    Parameters
+    ----------
+    newData : pandas.DataFrame
+        A Pandas DataFrame containing the new data with which to update the existing table if it exists
+    schema : frictionless.schema.schema.Schema
+        A Frictionless schema object that applies to both newData and existingData. Often created using morpc.frictionless.load_schema().
+    existingData : pandas.DataFrame
+        If provided, a Pandas DataFrame containing existing data to which the updated data in newData will be applied. It may be the case
+        that no data exists yet (the first time a script is run, for example). In that case, simply omit existingData or set it explicitly
+        to None and the function will return newData in a form that is compliant with the schema and (optionally) sorted.
+    sortColumns : str or list
+        Optional. A specification of the columns to use to sort the updated table. If set to None, no sorting will occur. If set to
+        "primary_key", the columns identified in schema.primary_key will be used. Otherwise, provide a list of strings representing the
+        names of columns to be used.
+    overwrite : bool
+        Optional. If True, records that already exist in existingData will be overwritten by equivalent records in newData (as identified
+        by identical primary key values in both dataframes). If False, an error will be raised if this case occurs.
+
+    Returns
+    -------
+    outputData : pandas.DataFrame
+        A dataframe that consists of the merged contents of existingData (if provided) and newData.
+    """
+
+    import morpc
+    import pandas as pd
+    import logging
+
+    logger = logging.getLogger(__name__)
+
+    myNewData = newData.copy()
+    # Verify that the primary key exists in the new data and that the index is in a known state
+    if(set(schema.primary_key).issubset(set(myNewData.columns))):
+        pass
+    elif(myNewData.index.names == schema.primary_key):
+        myNewData = myNewData.reset_index()
+    else:
+        logger.error("New data does not seem to contain the primary key, either as columns or as an index.")
+        raise RuntimeError
+
+    logger.info("Extracting required fields in new data and reordering them as specified in the schema.")
+    myNewData = myNewData.filter(items=schema.field_names, axis="columns")
+
+    logger.info("Casting new data to data types specified in schema.")
+    myNewData = morpc.frictionless.cast_field_types(myNewData, schema)
+
+    if(existingData is None):
+        myExistingData = None
+    else:
+        myExistingData = existingData.copy()
+        # Verify that the primary key exists in the existing data and that the index is in a known state
+        if(set(schema.primary_key).issubset(set(myExistingData.columns))):
+            pass
+        elif(myExistingData.index.names == schema.primary_key):
+            myExistingData = myExistingData.reset_index()
+        else:
+            logger.error("Existing data does not seem to contain the primary key, either as columns or as an index.")
+            raise RuntimeError
+
+        logger.info("Confirming existing data includes only the required fields and that they are in the order specified in the schema.")
+        myExistingData = myExistingData.filter(items=schema.field_names, axis="columns")
+
+        logger.info("Confirming existing data is cast as data types specified in schema.")
+        myExistingData = morpc.frictionless.cast_field_types(myExistingData, schema)
+
+    if(myExistingData is None):
+        logger.info("No existing data was found. Creating output data from scratch.")
+        outputData = myNewData.copy()
+    else:
+        logger.info("Existing output data was found. Merging new data with existing data.")
+
+        outputData = myExistingData.copy()
+
+        logger.info("Setting new data index to primary key specified in schema.")
+        myNewData = myNewData.set_index(schema.primary_key)
+
+        logger.info("Setting existing data index to primary key specified in schema.")
+        outputData = outputData.set_index(schema.primary_key)
+
+        logger.info("Analyzing differences between new and existing data.")
+        recordsToUpdate = myNewData.index.intersection(outputData.index)
+        logger.info(f"--> Found {len(recordsToUpdate)} records which are present in existing data and will be updated.")
+        recordsToAppend = myNewData.index.difference(outputData.index)
+        logger.info(f"--> Found {len(recordsToAppend)} records which are not present in existing data and will be appended.")
+
+        # Update entries that were already present in output table
+        if(not myNewData.loc[recordsToUpdate].empty):
+            try:
+                outputData.update(myNewData.loc[recordsToUpdate], overwrite=True, errors=("ignore" if overwrite == True else "raise"))
+            except Exception as e:
+                logger.error(f"Failed to update existing data with new data: {e}")
+                logger.info("To force the update when data overlaps, set the overwrite parameter to True.")
+                raise
+
+        # Append new entries that were not present in output table
+        if(not myNewData.loc[recordsToAppend].empty):
+            outputData = pd.concat([outputData, myNewData.loc[recordsToAppend]], axis="index")
+
+        logger.info("Resetting indices of all datasets.")
+        myNewData = myNewData.reset_index()
+        outputData = outputData.reset_index()
+
+        logger.info("Casting merged data to data types specified in schema.")
+        outputData = morpc.frictionless.cast_field_types(outputData, schema)
+
+    if(sortColumns == "primary_key"):
+        mySortColumns = schema.primary_key
+        logger.info(f"Sorting merged data by columns specified in primary key: {mySortColumns}")
+    elif(sortColumns is None):
+        logger.info("Column sort order is unspecified. Not sorting.")
+    elif(isinstance(sortColumns, list)):
+        mySortColumns = sortColumns
+        logger.info(f'Sorting merged data by user-specified columns: {",".join(mySortColumns)}')
+    else:
+        logger.error("User-specified value for the sortColumns parameter is not supported.")
+        raise RuntimeError
+
+    if(sortColumns is not None):
+        outputData = outputData.sort_values(mySortColumns)
+
+    return outputData
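A minimal usage sketch for the new function (the schema path, file path, and field names are hypothetical placeholders):

import pandas as pd
import morpc

# Frictionless schema describing the table, loaded per the docstring's suggestion
schema = morpc.frictionless.load_schema("output_data/mytable.schema.yaml")

newData = pd.DataFrame({"GEOID": ["39049", "39041"], "POP": [100, 200]})

# First run: no existing data yet, so the new data is returned as-is
# (schema-compliant and sorted by the primary key)
outputData = morpc.updateExistingTable(newData, schema, sortColumns="primary_key")

# Subsequent runs: merge into the previously saved table. Records whose primary
# keys already exist raise an error unless overwrite=True.
existingData = pd.read_csv("output_data/mytable.csv", dtype="str")
outputData = morpc.updateExistingTable(newData, schema, existingData=existingData,
                                       sortColumns="primary_key", overwrite=True)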