8 changes: 4 additions & 4 deletions morpc/census/geos.py
@@ -57,9 +57,9 @@
"for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['CEDS Region']])}"
}
},
{"regionmsa": {
{"regioncbsa": {
"in": "state:39",
"for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['REGIONMSA']])}"
"for": f"county:{','.join([morpc.CONST_COUNTY_NAME_TO_ID[x][2:6] for x in morpc.CONST_REGIONS['CBSA']])}"
}
}
]
@@ -68,8 +68,8 @@
"us": {
'for': 'us:1'
},
"columbusmsa": {
"for": f"metropolitan statistical area/micropolitan statistical area:{morpc.CONST_COLUMBUS_MSA_ID}"
"columbuscbsa": {
"for": f"metropolitan statistical area/micropolitan statistical area:{morpc.CONST_COLUMBUS_CBSA_ID}"
}
}

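The dict entries above are geography clauses for the Census Bureau data API. As a rough sketch of how such a clause might be combined into a request (the endpoint, dataset year, variable, and county codes here are illustrative assumptions, not values taken from this change):

    # Hypothetical sketch: the endpoint, variable, and county codes below are
    # assumptions for illustration; only the "for"/"in" clause shape comes from
    # the dicts above.
    import requests

    geography = {
        "in": "state:39",             # Ohio
        "for": "county:041,045,049",  # example three-digit county codes
    }

    params = {"get": "NAME,B01001_001E", **geography}
    resp = requests.get("https://api.census.gov/data/2022/acs/acs5", params=params)
    resp.raise_for_status()
    print(resp.json()[:3])  # header row plus the first data rows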
18 changes: 16 additions & 2 deletions morpc/logs.py
@@ -19,12 +19,23 @@
"critical": 50
}

-def config_logs(filename, level, mode = 'w'):
+def config_logs(filename=None, level='info', mode = 'w'):
"""
Set up logs within a notebook to store log outputs in filename, and display in output.
"""
import logging
import sys
+import os
+import ipykernel
+import json

+fileSpecified = True
+if(filename is None):
+    fileSpecified = False
+    connectionInfo = json.loads(ipykernel.get_connection_info())
+    scriptFilename = os.path.basename(os.path.normpath(connectionInfo["jupyter_session"]))
+    filename = f"{scriptFilename}.log"

logging.basicConfig(
level=LEVEL_MAP[level],
force=True,
@@ -36,4 +47,7 @@ def config_logs(filename, level, mode = 'w'):
)
logging.getLogger(__name__).setLevel(LEVEL_MAP[level])

-logger.info(f'Set up logging save to file {filename}')
+if(fileSpecified == False):
+    logger.info('Log filename not specified. Using default filename.')

+logger.info(f'Set up logging to file "{filename}", log level "{level}"')
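The updated signature supports both the old and new call patterns. A short sketch, assuming the function is importable as morpc.logs.config_logs:

    from morpc.logs import config_logs

    # Explicit filename and level, as before; mode='w' truncates any existing log.
    config_logs(filename="process-data.log", level="info", mode="w")

    # New with this change: omit the filename and a default is derived from the
    # running Jupyter session; the level now defaults to "info" as well.
    config_logs()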
162 changes: 154 additions & 8 deletions morpc/morpc.py
@@ -24,7 +24,7 @@

# Commonly used geographic identifiers
# The following are assigned by the U.S. Census Bureau
-CONST_COLUMBUS_MSA_ID = '18140'
+CONST_COLUMBUS_CBSA_ID = '18140'
CONST_OHIO_STATE_ID = '39'
CONST_OHIO_REGION_ID = '2' # Midwest
CONST_OHIO_DIVISION_ID = '3' # East North Central
@@ -124,7 +124,7 @@ def get_state_ids():
CONST_REGIONS["OneColumbus Region"] = CONST_REGIONS["REGIONONECBUS"]
CONST_REGIONS["REGIONCEDS"] = CONST_REGIONS["REGION10"] + ["Logan"]
CONST_REGIONS["CEDS Region"] = CONST_REGIONS["REGIONCEDS"]
CONST_REGIONS["REGIONMSA"] = CONST_REGIONS["REGION7"] + ["Hocking","Morrow","Perry"]
CONST_REGIONS["CBSA"] = CONST_REGIONS["REGION7"] + ["Hocking","Morrow","Perry"]

# Region identifiers
# Note that the Columbus MSA already has a GEOID that is defined by the Census Bureau. See CONST_COLUMBUS_MSA_ID above.
@@ -138,11 +138,11 @@
CONST_REGIONS_GEOID["REGIONONECBUS"] = "001"
CONST_REGIONS_GEOID["REGIONMPO"] = "001"
CONST_REGIONS_GEOID["REGIONTDM"] = "001"
CONST_REGIONS_GEOID["REGIONMSA"] = CONST_COLUMBUS_MSA_ID
CONST_REGIONS_GEOID["CBSA"] = CONST_COLUMBUS_CBSA_ID

# The following regions are comprised of collections of whole counties. Not all region definitions are county-based,
# for example the MPO region.
-CONST_REGIONS_COUNTYBASED = ["REGION15","REGION10","REGION7","REGIONCEDS","REGIONCORPO","REGIONONECBUS","REGIONMSA"]
+CONST_REGIONS_COUNTYBASED = ["REGION15","REGION10","REGION7","REGIONCEDS","REGIONCORPO","REGIONONECBUS","CBSA"]

# County name abbreviations
## CONST_COUNTY_ABBREV maps the full county name to its three-letter abbreviation
@@ -2807,7 +2807,7 @@ def write_table(df, path, format=None, index=None):
print("morpc.write_table | ERROR | This function does not currently support format {}. Add export arguments for this format in morpc.PANDAS_EXPORT_ARGS_OVERRIDE or use the native pandas export functions.".format(format))
raise RuntimeError

-def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryType="sum", roundPreserveSum=None, partialCoverageStrategy="error", zeroCoverageStrategy="error", sourceShareTolerance=6, targetShareTolerance=6):
+def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryType="sum", roundPreserveSum=None, partialCoverageStrategy="error", zeroCoverageStrategy="error", sourceShareTolerance=6, targetShareTolerance=6, returnIntersectData=False):
"""
Given you have some variable(s) summarized at one geography level, reapportion those variables to other geographies in proportion
to the area of overlap of the target geographies with the source geographies. This is accomplished by intersecting the target
@@ -2855,11 +2855,17 @@ def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryTy
geographies do not sum to 1. Round the sums to the specified decimal place prior to evaluation. Sum greater than 1 may indicate
that there are overlapping polygons in the target geos or source geos. Sum less than 1 may indicate that portions of the target geos
do not overlap the source geos. Set to None to allow no tolerance (warning will be generated if shares do not sum to exactly 1).
+returnIntersectData : bool
+    Optional. If False (default), return only the reapportioned data. If True, return a second output (GeoDataFrame)
+    consisting of the intersection geometries and their attributes.

Returns
-------
targetGeosUpdated : geopandas.geodataframe.GeoDataFrame with polygon geometry type
An updated version of targetGeos that includes the reapportioned variables.
+intersectGeosOutput : geopandas.geodataframe.GeoDataFrame with polygon geometry type
+    Returned only if returnIntersectData is True. The results of intersecting the source geos and target geos, including
+    areas and shares for both, plus geometry and area for each intersection polygon.
"""

import pandas as pd
@@ -2922,6 +2928,11 @@
# Compute the share of the target geo that each intersection polygon represents
intersectGeos["TARGET_SHARE"] = intersectGeos["INTERSECT_GEOS_AREA"] / intersectGeos["TARGET_GEOS_AREA"]

+# Create a copy of the intersection data that we can output if the user requested it. Put the columns in a sensible order.
+intersectGeosOutput = intersectGeos.copy()
+intersectGeosOutput = intersectGeosOutput.filter(items=["sourceIndex","targetIndex","SOURCE_GEOS_AREA","TARGET_GEOS_AREA","INTERSECT_GEOS_AREA",
+    "SOURCE_SHARE","TARGET_SHARE","geometry"], axis="columns")

# Make a list of the source geo IDs that appeared in the original source data but do not appear in the intersection data.
# These are source geos that had zero overlap with the target geos. If there are entries in the list, throw an error if appropriate.
if(summaryType == "sum"):
@@ -3043,8 +3054,11 @@ def reapportion_by_area(targetGeos, sourceGeos, apportionColumns=None, summaryTy
# Reorder the target geos columns as they were originally and append the reapportioned variables
# to the end.
targetGeosUpdated = targetGeosUpdated.filter(items=list(targetGeos.columns)+apportionColumns, axis="columns")

-return targetGeosUpdated

+if(returnIntersectData == True):
+    return (targetGeosUpdated, intersectGeosOutput)
+else:
+    return targetGeosUpdated

def hist_scaled(series, logy="auto", yRatioThreshold=100, xClassify=False, xRatioThreshold=100, scheme="NaturalBreaks", bins=10, retBinsCounts=False, figsize=None):
"""
@@ -3340,4 +3354,136 @@ def ages_in_year(self, years, generations=None):
youngest_age = max(youngest_age, 0)
agesInYear[generation][year] = list(range(youngest_age, oldest_age+1))

-return agesInYear
+return agesInYear

def updateExistingTable(newData, schema, existingData=None, sortColumns=None, overwrite=False):
"""
This function takes a data table in a well-defined form (as captured in a Frictionless schema) and attempts to update
an existing table in the same form, if one exists; otherwise it returns the new data as-is. If records with identical primary
key values exist in both tables, the new data overwrites the existing data for those records (provided overwrite is set to
True; otherwise an error is raised). Records in the new data which are not already present in the existing data will be
appended. The resulting dataframe is transformed to be compliant with the provided schema and, optionally, sorted by a
user-specified set of columns.

Parameters
----------
newData : pandas.dataframe.DataFrame
A Pandas DataFrame containing the new data to update the existing table if it exists
schema : frictionless.schema.schema.Schema
A Frictionless schema object that applies to both newData and existingData. Often created using morpc.frictionless.load_schema().
existingData : pandas.dataframe.DataFrame
If provided, a Pandas DataFrame containing existing data to which the updated data in newData will be applied. It may be the case
that no data exists yet (the first time a script is run, for example). In that case, simply omit existingData or set it explicitly
to None and the function will return newData in a form that is compliant with the schema and (optionally) sorted.
sortColumns : str or list
Optional. A specification of the columns to use to sort the updated table. If set to None, no sorting will occur. If set to
"primary_key", the columns identified in schema.primary_key will be used. Otherwise, provide a list of strings representing the
names of columns to be used.
overwrite : bool
Optional. If True, records that already exist in existingData will be overwritten by equivalent records in newData (as identified
by identical primary key values in both dataframes). If False, an error will be raised if this case occurs.

Returns
-------
outputData : pandas.dataframe.DataFrame
A dataframe that consists of the merged contents of existingData (if provided) and newData.
"""

import morpc
import pandas as pd
import logging

logger = logging.getLogger(__name__)

myNewData = newData.copy()
# Verify that the primary key exists in the new data and that the index is in a known state
if(set(schema.primary_key).issubset(set(myNewData.columns))):
pass
elif(myNewData.index.names == schema.primary_key):
myNewData = myNewData.reset_index()
else:
logger.error("New data does not seem to contain the primary key, either as columns or as an index.")
raise RuntimeError

logger.info("Extracting required fields in new data and reordering them as specified in the schema.")
myNewData = myNewData.filter(items=schema.field_names, axis="columns")

logger.info("Casting new data to data types specified in schema.")
myNewData = morpc.frictionless.cast_field_types(myNewData, schema)

if(existingData is None):
myExistingData = None
else:
myExistingData = existingData.copy()
# Verify that the primary key exists in the existing data and that the index is in a known state
if(set(schema.primary_key).issubset(set(myExistingData.columns))):
pass
elif(myExistingData.index.names == schema.primary_key):
myExistingData = myExistingData.reset_index()
else:
logger.error("Existing data does not seem to contain the primary key, either as columns or as an index.")
raise RuntimeError

logger.info("Confirming existing data includes only the required fields and that they are in the order specified in the schema.")
myExistingData = myExistingData.filter(items=schema.field_names, axis="columns")

logger.info("Confirming existing data is cast as data types specified in schema.")
myExistingData = morpc.frictionless.cast_field_types(myExistingData, schema)


if(myExistingData is None):
logger.info("No existing data was found. Creating output data from scratch.")
outputData = myNewData.copy()
else:
logger.info("Existing output data was found. Merging new data with existing data.")

outputData = myExistingData.copy()

logger.info("Setting new data index to primary key specified in schema.")
myNewData = myNewData.set_index(schema.primary_key)

logger.info("Setting existing data index to primary key specified in schema.")
outputData = outputData.set_index(schema.primary_key)

logger.info("Analyzing differences between new and existing data.")
recordsToUpdate = myNewData.index.intersection(outputData.index)
logger.info(f"--> Found {len(recordsToUpdate)} records which are present in existing data and will be updated.")
recordsToAppend = myNewData.index.difference(outputData.index)
logger.info(f"--> Found {len(recordsToAppend)} records which are not present in existing data and will be appended.")

# Update entries that were already present in output table
if(not myNewData.loc[recordsToUpdate].empty):
try:
outputData.update(myNewData.loc[recordsToUpdate], overwrite=True, errors=("ignore" if overwrite == True else "raise"))
except Exception as e:
logger.error(f"Failed to update existing data with new data: {e}")
logger.info("To force update when data overlaps, set overwrite parameter to True.")
raise

# Append new entries that were not present in output table
if(not myNewData.loc[recordsToAppend].empty):
outputData = pd.concat([outputData, myNewData.loc[recordsToAppend]], axis="index")

logger.info("Resetting indices of all datasets.")
myNewData = myNewData.reset_index()
outputData = outputData.reset_index()

logger.info("Casting merged data to data types specified in schema.")
outputData = morpc.frictionless.cast_field_types(outputData, schema)

if(sortColumns == "primary_key"):
mySortColumns = schema.primary_key
logger.info(f"Sorting merged data by columns specified in primary key: {mySortColumns}")
elif(sortColumns is None):
logger.info(f"Column sort order is unspecified. Not sorting columns.")
elif(isinstance(sortColumns, list)):
mySortColumns = sortColumns
logger.info(f'Sorting merged data by user-specified columns: {",".join(mySortColumns)}')
else:
logger.error('User-specified value for sortColumns parameter is not supported.')
raise RuntimeError

if(sortColumns is not None):
outputData = outputData.sort_values(mySortColumns)

return outputData
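
A sketch of a typical update cycle for this new helper; the file paths are placeholders, and it is assumed that updateExistingTable and morpc.frictionless.load_schema() (referenced in the docstring above) are importable from the morpc package:

    import pandas as pd
    import morpc

    schema = morpc.frictionless.load_schema("./output_data/mytable-schema.yaml")  # placeholder path

    newData = pd.read_csv("./input_data/latest.csv")  # placeholder path
    try:
        existingData = pd.read_csv("./output_data/mytable.csv")  # placeholder path
    except FileNotFoundError:
        existingData = None  # first run: the new data is returned, schema-compliant

    # Records sharing a primary key are overwritten (overwrite=True), new records
    # are appended, and the result is sorted by the schema's primary key columns.
    outputData = morpc.updateExistingTable(
        newData, schema, existingData=existingData,
        sortColumns="primary_key", overwrite=True,
    )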