diff --git a/.github/workflows/docker-build.yaml b/.github/workflows/docker-build.yaml new file mode 100644 index 0000000..745b79a --- /dev/null +++ b/.github/workflows/docker-build.yaml @@ -0,0 +1,78 @@ +name: Docker Build + +on: + workflow_call: + inputs: + push_to_registry: + type: boolean + default: false + required: false + version: + type: string + required: true + branch: + type: string + required: true + platforms: + type: string + default: 'linux/amd64' + required: false + secrets: + AWS_ACCOUNT_NUMBER: + required: true + AWS_ROLE: + required: true + AWS_REGION: + required: true + +permissions: + contents: read + packages: write + id-token: write + +jobs: + build-docker: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::${{ secrets.AWS_ACCOUNT_NUMBER }}:role/${{ secrets.AWS_ROLE }} + role-session-name: updateimage + aws-region: ${{ secrets.AWS_REGION }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + with: + registries: ${{ secrets.AWS_ACCOUNT_NUMBER }} + + - name: Set Docker tags based on branch + id: set_tags + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + run: | + if [[ "${{ inputs.branch }}" == "main" ]]; then + echo "TAGS=$ECR_REGISTRY/hydroshift:latest,$ECR_REGISTRY/hydroshift:${{ inputs.version }}" >> $GITHUB_OUTPUT + else + echo "TAGS=$ECR_REGISTRY/hydroshift:pr-${{ github.event.pull_request.number }}" >> $GITHUB_OUTPUT + fi + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . 
+ platforms: ${{ inputs.platforms }} + push: ${{ inputs.push_to_registry }} + tags: ${{ steps.set_tags.outputs.TAGS }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.github/workflows/get-version.yaml b/.github/workflows/get-version.yaml new file mode 100644 index 0000000..952d9ae --- /dev/null +++ b/.github/workflows/get-version.yaml @@ -0,0 +1,44 @@ +name: Get Version + +on: + workflow_call: + outputs: + version: + description: "The version string extracted from hydroshift.__version__" + value: ${{ jobs.get-version.outputs.version }} + is_new_version: + description: "Whether this version is new (not previously tagged)" + value: ${{ jobs.get-version.outputs.is_new_version }} + +jobs: + get-version: + runs-on: ubuntu-latest + outputs: + version: ${{ steps.get_version.outputs.VERSION }} + is_new_version: ${{ steps.check_tag.outputs.IS_NEW_VERSION }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.12' + + - name: Extract version from hydroshift + id: get_version + run: | + VERSION=$(python -c "import hydroshift; print(hydroshift.__version__)") + echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT" + + - name: Check if version exists as tag + id: check_tag + run: | + if git tag | grep -Fqx "v${{ steps.get_version.outputs.VERSION }}"; then + echo "Version ${{ steps.get_version.outputs.VERSION }} already exists as a tag." + echo "IS_NEW_VERSION=false" >> "$GITHUB_OUTPUT" + else + echo "Version ${{ steps.get_version.outputs.VERSION }} does not exist as a tag." 
+ echo "IS_NEW_VERSION=true" >> "$GITHUB_OUTPUT" + fi diff --git a/.github/workflows/main-push.yaml b/.github/workflows/main-push.yaml new file mode 100644 index 0000000..48f6820 --- /dev/null +++ b/.github/workflows/main-push.yaml @@ -0,0 +1,48 @@ +name: Main Branch Push + +on: + push: + branches: [ "main" ] + paths: + - 'hydroshift/**' + - 'tests/**' + - 'Dockerfile' + - '.dockerignore' + - '.github/workflows/**' + - 'pyproject.toml' + workflow_dispatch: + +permissions: + contents: write + packages: write + id-token: write + +jobs: + get-version: + name: Extract Package Version + uses: ./.github/workflows/get-version.yaml + + check-version-exists: + name: Verify Version is New + needs: get-version + runs-on: ubuntu-latest + steps: + - name: Fail if version already exists + if: needs.get-version.outputs.is_new_version != 'true' + run: | + echo "::error::ERROR: Version ${{ needs.get-version.outputs.version }} already exists as a tag." + echo "::error::Please update the version in hydroshift/__init__.py before releasing to main." + exit 1 + - name: Version check passed + run: echo "Version ${{ needs.get-version.outputs.version }} is new. Proceeding with release." 
+ + docker-build-push: + name: Build and Push Docker Image + needs: [get-version, check-version-exists] + uses: ./.github/workflows/docker-build.yaml + with: + push_to_registry: true + version: ${{ needs.get-version.outputs.version }} + branch: 'main' + platforms: 'linux/amd64' + secrets: inherit diff --git a/.github/workflows/pr-check.yaml b/.github/workflows/pr-check.yaml new file mode 100644 index 0000000..e742487 --- /dev/null +++ b/.github/workflows/pr-check.yaml @@ -0,0 +1,33 @@ +name: PR Checks + +on: + pull_request: + branches: [ "main" ] + paths: + - 'hydroshift/**' + - 'tests/**' + - 'Dockerfile' + - '.dockerignore' + - '.github/workflows/**' + - 'pyproject.toml' + +permissions: + contents: read + packages: write + id-token: write + +jobs: + get-version: + name: Extract Package Version + uses: ./.github/workflows/get-version.yaml + + docker-build-test: + name: Test Docker Build + needs: get-version + uses: ./.github/workflows/docker-build.yaml + with: + push_to_registry: false + version: ${{ needs.get-version.outputs.version }} + branch: ${{ github.base_ref }} + platforms: 'linux/amd64' + secrets: inherit diff --git a/.devcontainer/Dockerfile b/Dockerfile similarity index 100% rename from .devcontainer/Dockerfile rename to Dockerfile diff --git a/hydroshift/__init__.py b/hydroshift/__init__.py index e69de29..3dc1f76 100644 --- a/hydroshift/__init__.py +++ b/hydroshift/__init__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/hydroshift/_pages/__init__.py b/hydroshift/_pages/__init__.py index f8097f9..6548600 100644 --- a/hydroshift/_pages/__init__.py +++ b/hydroshift/_pages/__init__.py @@ -1,2 +1,3 @@ from hydroshift._pages.changepoint import changepoint +from hydroshift._pages.homepage import homepage from hydroshift._pages.summary import summary diff --git a/hydroshift/_pages/changepoint.py b/hydroshift/_pages/changepoint.py index 319a1df..8f23cf6 100644 --- a/hydroshift/_pages/changepoint.py +++ b/hydroshift/_pages/changepoint.py @@ -1,43 +1,43 @@ """A
tool to identify changepoints in hydrologic timeseries.""" import traceback +import uuid from collections import defaultdict from dataclasses import dataclass, field from io import BytesIO -import numpy as np import pandas as pd import streamlit as st -from data_retrieval import ( - get_ams, -) from docx import Document from docx.shared import Inches -from plotly import graph_objects -from plots import combo_cpm, plot_lp3 - -from hydroshift.consts import METRICS, VALID_ARL0S -from hydroshift.data_retrieval import log_pearson_iii -from hydroshift.stats.tests import cp_pvalue_batch, cpm_process_stream -from hydroshift.text.changepoint import references, test_description +from plotly.graph_objects import Figure + +from hydroshift.consts import ( + CP_F1_CAPTION, + CP_F2_CAPTION, + CP_T1_CAPTION, + CP_T2_CAPTION, + METRICS, + VALID_ARL0S, +) +from hydroshift.utils.changepoint import cp_pvalue_batch, cpm_process_stream +from hydroshift.utils.common import num_2_word +from hydroshift.utils.data_retrieval import Gage +from hydroshift.utils.ffa import LP3Analysis +from hydroshift.utils.jinja import render_template, write_template +from hydroshift.utils.plots import combo_cpm, plot_lp3 @dataclass class ChangePointAnalysis: """OOP representation of the changepoint analysis.""" - data: dict = field(default_factory=pd.DataFrame) - missing_years: int = 0 + gage: Gage = field(default_factory=Gage) pval_df: dict = None cp_dict: dict = field(default_factory=dict) ffa_plot = None ffa_df: dict = field(default_factory=pd.DataFrame) - @property - def ts(self) -> np.ndarray: - """Timeseries from data.""" - return self.data["peak_va"].values - @property def nonstationary(self) -> bool: """Whether or not a nonstationarity was identified.""" @@ -70,7 +70,9 @@ def get_change_windows(self, max_dist: float = 10) -> tuple: current_group_tests = set(test_dict[dates[0]]) for i in range(1, len(dates)): - if ((dates[i] - current_group[-1]).days / 365) < max_dist: # 10 years in days + if ( + 
(dates[i] - current_group[-1]).days / 365 + ) < max_dist: # 10 years in days current_group.append(dates[i]) current_group_tests = current_group_tests.union(test_dict[dates[i]]) else: @@ -85,43 +87,28 @@ def get_change_windows(self, max_dist: float = 10) -> tuple: return groups, test_counts - def get_max_pvalue(self): + def get_max_pvalue(self) -> tuple[float, int]: """Get the minimum p value where all tests agree and count how often that occurred.""" all_tests = self.pval_df.fillna(1).max(axis=1).to_frame(name="pval") all_tests["run"] = ((all_tests != all_tests.shift(1)) * 1).cumsum() - for ind, r in all_tests.groupby("pval").agg({"run": pd.Series.nunique}).sort_index().iterrows(): + for ind, r in ( + all_tests.groupby("pval") + .agg({"run": pd.Series.nunique}) + .sort_index() + .iterrows() + ): if r.run > 0: min_p = ind count = r.run break return min_p, count - def num_2_word(self, number: int) -> str: - """Convert numbers less than 10 to words.""" - if number > 9: - return number - else: - d = { - 0: "no", - 1: "one", - 2: "two", - 3: "three", - 4: "four", - 5: "five", - 6: "six", - 7: "seven", - 8: "eight", - 9: "nine", - } - return d[number] - @property - def summary_plot(self) -> graph_objects: + def summary_plot(self) -> Figure: """Create plotly plot to summarize analysis.""" - return combo_cpm(self.data, self.pval_df, self.cp_dict) + return combo_cpm(self.gage.ams, self.pval_df, self.cp_dict) @property - @st.cache_resource def summary_png(self) -> BytesIO: """Export summary plot to png in memory.""" bio = BytesIO() @@ -136,10 +123,11 @@ def ffa_png(self): return bio @property - @st.cache_resource def cp_df(self): """Get a dataframe representing changepoints identified in the streaming analysis.""" - cpa_df = pd.DataFrame.from_dict(self.cp_dict, orient="index", columns=["Tests Identifying Change"]) + cpa_df = pd.DataFrame.from_dict( + self.cp_dict, orient="index", columns=["Tests Identifying Change"] + ) cpa_df.index = cpa_df.index.date return cpa_df @@ 
-151,95 +139,66 @@ def title(self) -> str: @property def summary_text(self) -> str: """A text summary of the changepoint analysis.""" - if self.nonstationary: - end_text = """ - some form of nonstationarity (e.g., land use change, climate change, flow regulation, etc) may be influencing flow patterns at this site. - """ - else: - end_text = """an assumption of nonstationary conditions is likely reasonable.""" - cp_count = self.num_2_word(len(self.cp_dict)) - if len(self.cp_dict) == 1: - plural_text = "point was" - else: - plural_text = "points were" - return ( - """ - There is **{}** evidence that the annual maximum series data at USGS gage {} are nonstationary in time. Four change - point detection tests were completed to assess changes in the mean, variance, and overall distribution of flood - peaks across the period of record. Significant change points were identified using a Type I error rate of 1 in - {} and ignoring significant changes in the first {} years of data. {} statistically significant change - {} identified, indicating that {} - """ - ).format( - self.evidence_level, - st.session_state.gage_id, - st.session_state.arlo_slider, - st.session_state.burn_in, - cp_count, - plural_text, - end_text, - ) + payload = { + "evidence_level": self.evidence_level, + "gage_id": self.gage.gage_id, + "arl0": st.session_state.arlo_slider, + "burn_in": st.session_state.burn_in, + "cp_count": num_2_word(len(self.cp_dict)), + "plural": len(self.cp_dict) != 1, + "nonstationary": self.nonstationary, + } + return render_template("changepoint_summary_1.md", payload) + + @property + def test_description(self): + """Analysis methodology.""" + payload = { + "alr0": st.session_state.arlo_slider, + "burn_in": st.session_state.burn_in, + } + return render_template("changepoint_description.md", payload) @property def results_text(self) -> str: """A detailed description of the results.""" - # Static analysis + # Gather stats min_p, p_count = self.get_max_pvalue() if min_p == 0.001: 
evidence = "strong" elif min_p == 0.005: evidence = "moderate" + elif self.pval_df.isna().all().all(): + evidence = "no" elif min_p < 1: evidence = "minor" - if p_count > 1: - plural = "s" - else: - plural = "" - if min_p < 1: - sentence = "A minimum p-value of {} was obtained at {} contiguous time period{}.".format( - min_p, self.num_2_word(p_count), plural - ) - else: - sentence = "There were no time-periods where a p-value less than 0.05 was observed in all tests." - if self.pval_df.isna().all().all(): - evidence = "no" - else: - evidence = "minimal" - p1 = """ - The static changepoint test showed {} evidence of a changepoint in the timeseries. {} The p-value reflects the probability that the distribution of - flood peaks before that date has the **same** distribution as the flood peaks after the date. - """.format( - evidence, sentence - ) - - # Streaming analysis - if len(self.cp_dict) == 1: - p2 = """ - The streaming analysis identified one statistically significant changepoint. This changepoint was identified - by {} distinct tests: {}.""" - else: - groups, test_counts = self.get_change_windows() - if len(groups) > 0: - plural = "s" - else: - plural = "" - p2 = """ - The streaming analysis identified {} statistically significant changepoints. These changepoints broadly fell - into {} window{} where tests identified changepoints not more than 10 years apart. For a full summary of which - tests identified changes at which dates, see table 2. 
- """.format( - self.num_2_word(len(self.cp_dict)), self.num_2_word(len(groups)), plural - ) - return "\n\n".join([p1, p2]) + groups, _ = self.get_change_windows() + + # format + payload = { + "evidence": evidence, + "min_p": min_p, + "p_count": num_2_word(p_count), + "plural": p_count > 1, + "len_cp": len(self.cp_dict), + "len_cp_str": num_2_word(len(self.cp_dict)), + "test_count": num_2_word( + len(self.cp_dict[next(iter(self.cp_dict))].split(",")) + ), + "grp_count": num_2_word(len(groups)), + "plural_2": len(groups) > 0, + } + return render_template("changepoint_summary_2.md", payload) @property def ffa_text(self) -> str: - return """ - Based on the changepoint analysis results, a modified flood frequency analysis was conducted - using truncated periods of record. The truncated periods correspond with times when the - hydrologic regime appears to be stationary across time. Results from this analysis are shown - in Figure 2 and Table 2. - """ + """Methodology for the modified FFA.""" + return render_template("changepoint_ffa.md") + + @property + def references(self) -> str: + """Citations.""" + return render_template("changepoint_references.md") @property def word_data(self) -> BytesIO: @@ -250,10 +209,7 @@ def word_data(self) -> BytesIO: self.add_markdown_to_doc(document, self.summary_text) document.add_picture(self.summary_png, width=Inches(6.5)) document.add_heading("Changepoint detection method", level=2) - self.add_markdown_to_doc( - document, - test_description.format(st.session_state.arlo_slider, st.session_state.burn_in, st.session_state.burn_in), - ) + self.add_markdown_to_doc(document, self.test_description) if len(self.cp_dict) > 0: document.add_heading("Changepoint detection results", level=2) self.add_markdown_to_doc(document, self.results_text) @@ -264,18 +220,22 @@ def word_data(self) -> BytesIO: document.add_heading("Modified flood frequency analysis", level=2) self.add_markdown_to_doc(document, self.ffa_text) document.add_picture(self.ffa_png, 
width=Inches(6.5)) - self.add_markdown_to_doc(document, "**Figure 2.** Modified flood frequency analysis.") + self.add_markdown_to_doc( + document, "**Figure 2.** Modified flood frequency analysis." + ) self.add_markdown_to_doc(document, "**Table 2.** Modified flood quantiles.") self.add_table_from_df(self.ffa_df, document, index_name="Regime Period") document.add_heading("References", level=2) - self.add_markdown_to_doc(document, references) + self.add_markdown_to_doc(document, self.references) out = BytesIO() document.save(out) return out - def add_table_from_df(self, df: pd.DataFrame, document: Document, index_name: str = None): + def add_table_from_df( + self, df: pd.DataFrame, document: Document, index_name: str = None + ): if index_name is not None: df = df.copy() cols = df.columns @@ -301,26 +261,20 @@ def add_markdown_to_doc(self, document: Document, text: str): p.add_run(r).bold = bold bold = not bold - def get_data(self, gage_id: int): + def validate_data(self): """Cache results of get_ams.""" - data = get_ams(gage_id) + data = self.gage.ams # Validate if data is None: st.session_state.valid_data = False st.session_state.data_comment = "Unable to retrieve data." - elif data["peaks"] is None: - st.session_state.valid_data = False - st.session_state.data_comment = "Unable to retrieve data." - elif len(data["peaks"]) < st.session_state.burn_in: + elif len(data) < st.session_state.burn_in: st.session_state.valid_data = False - st.session_state.data_comment = ( - "Not enough peaks available for analysis. {} peaks found, but burn-in length was {}".format( - len(data["peaks"]), st.session_state.burn_in - ) + st.session_state.data_comment = "Not enough peaks available for analysis. 
{} peaks found, but burn-in length was {}".format( + len(data), st.session_state.burn_in ) else: - self.data, self.missing_years = data["peaks"], data["missing_years"] st.session_state.valid_data = True @@ -338,51 +292,73 @@ def define_variables(): # Instantiate analysis class and get data if "changepoint" not in st.session_state: - st.session_state.changepoint = ChangePointAnalysis(st.session_state.gage_id) - st.session_state.changepoint.get_data(st.session_state.gage_id) + st.session_state.changepoint = ChangePointAnalysis(st.session_state.gage) + st.session_state.changepoint.validate_data() + + +def refresh_data_editor(): + """Set a new uuid for the data entry widget.""" + st.session_state.data_editor_key = str(uuid.uuid4()) def make_sidebar(): """User control for analysis.""" with st.sidebar: st.title("Settings") - st.session_state["gage_id"] = st.text_input("Enter USGS Gage Number:", st.session_state["gage_id"]) - st.select_slider( - "False Positive Rate (1 in #)", - options=VALID_ARL0S, - value=1000, - key="arlo_slider", - label_visibility="visible", + st.session_state["gage_id"] = st.text_input( + "Enter USGS Gage Number:", + st.session_state["gage_id"], + on_change=refresh_data_editor, ) + st.session_state.gage = Gage(st.session_state["gage_id"]) + with st.form("changepoint_params"): + st.select_slider( + "False Positive Rate (1 in #)", + options=VALID_ARL0S, + value=1000, + key="arlo_slider", + label_visibility="visible", + ) - st.number_input( - "Burn-in Period", - 0, - 100, - 20, - key="burn_in", - label_visibility="visible", - ) - st.text("Flood Frequency Analysis") - init_data = pd.DataFrame(columns=["Regime Start", "Regime End"]) - init_data["Regime Start"] = pd.to_datetime(init_data["Regime Start"]) - init_data["Regime End"] = pd.to_datetime(init_data["Regime End"]) - - start_config = st.column_config.DateColumn("Regime Start", format="D/M/YYYY") - end_config = st.column_config.DateColumn("Regime End", format="D/M/YYYY") - st.data_editor( - 
init_data, - num_rows="dynamic", - key="ffa_regimes", - column_config={"Regime Start": start_config, "Regime End": end_config}, - ) + st.number_input( + "Burn-in Period", + 0, + 100, + 20, + key="burn_in", + label_visibility="visible", + ) + st.text("Flood Frequency Analysis") + init_data = pd.DataFrame(columns=["Regime Start", "Regime End"]) + init_data["Regime Start"] = pd.to_datetime(init_data["Regime Start"]) + init_data["Regime End"] = pd.to_datetime(init_data["Regime End"]) + + # Make data editor. Unique key allows for refreshing + if "data_editor_key" not in st.session_state: + refresh_data_editor() + start_config = st.column_config.DateColumn( + "Regime Start", format="D/M/YYYY" + ) + end_config = st.column_config.DateColumn("Regime End", format="D/M/YYYY") + st.data_editor( + init_data, + num_rows="dynamic", + key=st.session_state.data_editor_key, + column_config={"Regime Start": start_config, "Regime End": end_config}, + ) + st.form_submit_button(label="Run Analysis") + + st.divider() + write_template("data_sources_side_bar.html") def run_analysis(): """Run the change point model analysis.""" cpa = st.session_state.changepoint - cpa.pval_df = get_pvalues(cpa.data) - cpa.cp_dict = get_changepoints(cpa.data, st.session_state.arlo_slider, st.session_state.burn_in) + cpa.pval_df = get_pvalues(cpa.gage.ams) + cpa.cp_dict = get_changepoints( + cpa.gage.ams, st.session_state.arlo_slider, st.session_state.burn_in + ) @st.cache_data @@ -412,19 +388,26 @@ def get_changepoints(data: pd.DataFrame, arl0: int, burn_in: int) -> dict: @st.cache_data def ffa_analysis(data: pd.DataFrame, regimes: list): """Run multiple flood frequency analyses for different regimes.""" - ffa_dict = {} + ffas = [] for r in regimes: if "Regime Start" in r and "Regime End" in r: sub = data.loc[r["Regime Start"] : r["Regime End"]].copy() - peaks = sub["peak_va"] - lp3 = log_pearson_iii(peaks) + peaks = sub["peak_va"].values label = f'{r["Regime Start"]} - {r["Regime End"]}' - ffa_dict[label] = 
{"peaks": sub, "lp3": lp3} - if len(ffa_dict) > 0: - ffa_plot = plot_lp3(ffa_dict, st.session_state.gage_id, multi_series=True) - ffa_df = pd.DataFrame.from_dict({k: ffa_dict[k]["lp3"] for k in ffa_dict}, orient="index") - renames = {c: f"Q{c}" for c in ffa_df.columns} - ffa_df = ffa_df.rename(columns=renames) + lp3 = LP3Analysis( + st.session_state.gage_id, + peaks, + use_map_skew=False, + est_method="MOM", + label=label, + ) + ffas.append(lp3) + if len(ffas) > 0: + ffa_plot = plot_lp3(ffas) + ffa_df = ffas[0].quantile_df[["Recurrence Interval (years)"]] + ffa_df = ffa_df.set_index("Recurrence Interval (years)") + for i in ffas: + ffa_df[i.label] = i.quantile_df["Discharge (cfs)"].values return ffa_plot, ffa_df else: return None, None @@ -434,38 +417,37 @@ def make_body(): """Assemble main app body.""" left_col, right_col = st.columns([2, 1]) # Formatting with left_col: - cpa = st.session_state.changepoint + cpa: ChangePointAnalysis = st.session_state.changepoint st.title(cpa.title) warnings() st.header("Summary") st.markdown(cpa.summary_text) st.plotly_chart(cpa.summary_plot, use_container_width=True) - st.markdown("**Figure 1.** Statistical changepoint analysis.") + st.markdown(CP_F1_CAPTION) st.header("Changepoint detection method") - st.markdown( - test_description.format(st.session_state.arlo_slider, st.session_state.burn_in, st.session_state.burn_in) - ) + st.markdown(cpa.test_description) if len(cpa.cp_dict) > 0: st.header("Changepoint detection results") st.markdown(cpa.results_text) if len(cpa.cp_dict) > 1: st.table(cpa.cp_df) - st.markdown( - "**Table 1.** Results of the changepoint analysis, listing dates when a significant change was identified for each test statistic." 
- ) + st.markdown(CP_T1_CAPTION) st.header("Modified flood frequency analysis") - if len(st.session_state.ffa_regimes["added_rows"]) > 0: - ffa_plot, ffa_df = ffa_analysis(cpa.data, st.session_state.ffa_regimes["added_rows"]) + if len(st.session_state[st.session_state.data_editor_key]["added_rows"]) > 0: + ffa_plot, ffa_df = ffa_analysis( + cpa.gage.ams, + st.session_state[st.session_state.data_editor_key]["added_rows"], + ) if ffa_plot is not None and ffa_df is not None: st.markdown(cpa.ffa_text) cpa.ffa_plot = ffa_plot cpa.ffa_df = ffa_df st.plotly_chart(ffa_plot, use_container_width=True) - st.markdown("**Figure 2.** Modified flood frequency analysis.") - st.markdown("**Table 2.** Modified flood quantiles.") + st.markdown(CP_F2_CAPTION) + st.markdown(CP_T2_CAPTION) st.table(ffa_df) else: st.info( @@ -473,17 +455,28 @@ def make_body(): ) st.header("References") - st.markdown(references) + st.markdown(cpa.references) - st.download_button("Download analysis", cpa.word_data, f"changepoint_analysis_{st.session_state.gage_id}.docx") + st.download_button( + "Download analysis", + cpa.word_data, + f"changepoint_analysis_{st.session_state.gage_id}.docx", + ) def warnings(): """Print warnings on data validity etc.""" - if st.session_state.changepoint.data is not None and "peak_va" in st.session_state.changepoint.data.columns: - if st.session_state.changepoint.missing_years: + if ( + st.session_state.changepoint.gage.ams is not None + and "peak_va" in st.session_state.changepoint.gage.ams.columns + ): + if st.session_state.changepoint.gage.missing_dates_ams: st.warning( - f"Missing {len(st.session_state.changepoint.missing_years)} dates between {st.session_state.changepoint.data.index.min()} and {st.session_state.changepoint.data.index.max()}" + "Missing {} dates between {} and {}".format( + len(st.session_state.changepoint.gage.missing_dates_ams), + st.session_state.changepoint.gage.ams.index.min(), + st.session_state.changepoint.gage.ams.index.max(), + ) ) diff --git 
a/hydroshift/_pages/homepage.py b/hydroshift/_pages/homepage.py new file mode 100644 index 0000000..f3c2a00 --- /dev/null +++ b/hydroshift/_pages/homepage.py @@ -0,0 +1,40 @@ +import streamlit as st + +from hydroshift.consts import DEFAULT_GAGE +from hydroshift.utils.jinja import write_template + + +def homepage(): + """Landing page for app.""" + st.set_page_config(page_title="HydroShift", layout="wide") + + left_col, right_col, _ = st.columns([2, 1.5, 0.2], gap="large") + + with left_col: + st.title("HydroShift") + st.subheader("USGS Streamflow Change Detection Tool") + write_template("app_description.md") + + st.write("") # blank line for more space + gage_input = st.text_input("Enter a USGS Gage Number to begin:") + col1, col2 = st.columns([1, 8]) + with col1: + submit = st.button("Submit") + with col2: + demo = st.button("Use Demo Data") + + if submit and gage_input: + st.session_state["gage_id"] = gage_input + if demo: + st.session_state["gage_id"] = DEFAULT_GAGE + if st.session_state["gage_id"] is not None: + # try: + # st.session_state["site_data"] = load_site_data(st.session_state["gage_id"]) + # except ValueError: + # st.error(f"Data not found for gage: {st.session_state['gage_id']}") + st.rerun() + with right_col: + st.title("") + st.image("hydroshift/images/logo_base.png") + + write_template("footer.html") diff --git a/hydroshift/_pages/summary.py b/hydroshift/_pages/summary.py index f1259b1..59caf23 100644 --- a/hydroshift/_pages/summary.py +++ b/hydroshift/_pages/summary.py @@ -1,12 +1,13 @@ +from datetime import date + import folium import streamlit as st -from data_retrieval import ( - get_ams, - get_daily_values, - get_flow_stats, - get_monthly_values, -) -from plots import ( +from streamlit_folium import st_folium + +from hydroshift.utils.data_retrieval import Gage +from hydroshift.utils.ffa import LP3Analysis +from hydroshift.utils.jinja import write_template +from hydroshift.utils.plots import ( plot_ams, plot_ams_seasonal, plot_daily_mean, @@ 
-14,7 +15,118 @@ plot_lp3, plot_monthly_mean, ) -from streamlit_folium import st_folium + + +def section_ams(gage: Gage): + """Display the AMS section.""" + if gage.ams is not None: + if gage.missing_dates_ams is not None and len(gage.missing_dates_ams) > 0: + st.warning(f"Missing {len(gage.missing_dates_ams)} AMS records") + st.plotly_chart(plot_ams(gage.ams, gage.gage_id)) + show_data = st.checkbox("Show AMS Data Table") + if show_data: + st.dataframe(gage.ams) + + +def section_flow_stats(gage: Gage): + """Display the flow statistics section.""" + if gage.flow_stats is not None: + st.plotly_chart(plot_flow_stats(gage.flow_stats, gage.gage_id)) + show_data = st.checkbox("Show Daily Stats Data Table") + if show_data: + st.dataframe(gage.flow_stats) + + +def section_lp3(gage: Gage): + """Display the FFA section.""" + if gage.ams is not None: + # Options + opt_col_1, opt_col_2 = st.columns(2) + with opt_col_1: + est_method = st.selectbox( + "Estimation Method", + ["L-moments", "Method of moments", "Maximum Likelihood"], + index=1, + ) + est_method = { + "L-moments": "LMOM", + "Method of moments": "MOM", + "Maximum Likelihood": "MLE", + }[est_method] + with opt_col_2: + st_skew = st.toggle( + "Use regional skew", value=False, disabled=not gage.has_regional_skew + ) + if st_skew: + use_map = True + else: + use_map = False + + # Analysis and display + lp3 = LP3Analysis(gage.gage_id, gage.ams_vals, use_map, est_method, "") + if gage.missing_dates_ams is not None and len(gage.missing_dates_ams) > 0: + st.warning(f"Missing {len(gage.missing_dates_ams)} LP3 records") + st.plotly_chart(plot_lp3(lp3), use_container_width=True) + show_data = st.checkbox("Show LP3 Data Table") + if show_data: + st.dataframe(lp3.quantile_df) + + +def section_ams_seasonal(gage: Gage): + """Display the ams with seasonal attributes section.""" + if gage.ams is not None: + if gage.missing_dates_ams: + st.warning(f"Missing {len(gage.missing_dates_ams)} AMS seasonal records") + st.plotly_chart( + 
plot_ams_seasonal(gage.ams, gage.gage_id), use_container_width=True + ) + show_data = st.checkbox("Show Ranked Seasonal Data Table") + if show_data: + st.dataframe(gage.ams) + + +def section_daily_mean(gage: Gage): + """Display the daily mean discharge section.""" + plot_col, input_col = st.columns([8, 2]) + + with input_col: + st.write("") # blank line for more space + st.write("Daily Mean Input Dates") + start_date = st.date_input("Start Date", value=date(2024, 1, 1)) + end_date = st.date_input("End Date", value=date(2024, 12, 31)) + + data, missing_dates = gage.get_daily_values( + start_date.strftime("%Y-%m-%d"), + end_date.strftime("%Y-%m-%d"), + ) + missing_dates = gage.missing_dates_daily_values(start_date, end_date) + with plot_col: + if data is not None: + if missing_dates: + st.warning(f"Missing {len(missing_dates)} daily mean records") + st.plotly_chart( + plot_daily_mean(data, gage.gage_id), use_container_width=True + ) + show_data = st.checkbox("Show Daily Mean Data Table") + if show_data: + st.dataframe(data) + + +def section_monthly_mean(gage: Gage): + """Display the monthly mean discharge section.""" + data = gage.get_monthly_values() + missing_dates = gage.missing_dates_monthly_values + if data is not None and "mean_va" in data.columns: + if missing_dates: + st.warning(f"Missing {len(missing_dates)} monthly records") + st.plotly_chart( + plot_monthly_mean(data, st.session_state["gage_id"]), + use_container_width=True, + ) + + show_data = st.checkbox("Show Monthly Mean Data Table") + if show_data: + st.dataframe(data) def summary(): @@ -23,7 +135,10 @@ def summary(): # Sidebar for input with st.sidebar: st.title("Settings") - st.session_state["gage_id"] = st.text_input("Enter USGS Gage Number:", st.session_state["gage_id"]) + st.session_state["gage_id"] = st.text_input( + "Enter USGS Gage Number:", st.session_state["gage_id"] + ) + gage = Gage(st.session_state["gage_id"]) # Toggle plots st.markdown("### Toggle Plots") @@ -34,119 +149,51 @@ def 
summary(): show_daily_mean = st.checkbox("Daily Mean Streamflow", value=True) show_monthly_mean = st.checkbox("Monthly Mean Streamflow", value=True) + # Data sources + st.divider() + write_template("data_sources_side_bar.html") + if st.session_state["gage_id"]: - with st.spinner("Loading gage data..."): # This is mainly here to clear previous pages while data loads. - try: - if st.session_state["gage_id"] == "testing": - site_data = { - "Site Number": "-99999", - "Station Name": "Wet River", - "Latitude": 45, - "Longitude": -103, - "Drainage Area": 0, - "HUC Code": 0, - "Elevation Datum": "NAVD88", - } - else: - site_data = st.session_state["site_data"] - lat, lon = site_data["Latitude"], site_data["Longitude"] - except ValueError as e: - lat, lon = None, None - st.error(f"{e}") + with st.spinner( + "Loading gage data..." + ): # This is here to clear previous pages while data loads. + pass col2, col3 = st.columns([6, 2], gap="large") - if lat and lon: - with col3: + if gage.latitude and gage.longitude: + with col3: # Site map st.subheader("Gage Location") # Create Folium Map - mini_map = folium.Map(location=[lat, lon], zoom_start=7, width=200, height=200) + mini_map = folium.Map( + location=[gage.latitude, gage.longitude], + zoom_start=7, + width=200, + height=200, + ) folium.Marker( - [lat, lon], popup=f"Gage {st.session_state['gage_id']}", icon=folium.Icon(color="green") + [gage.latitude, gage.longitude], + popup=f"Gage {st.session_state['gage_id']}", + icon=folium.Icon(color="green"), ).add_to(mini_map) st_folium(mini_map, width=250, height=250) # Display site metadata st.subheader("Site Information") - st.markdown( - f""" - **Site Number:** {site_data["Site Number"]}
- **Station Name:** {site_data["Station Name"]}
- **Latitude:** {site_data["Latitude"]}
- **Longitude:** {site_data["Longitude"]}
- **Drainage Area (sq.mi.):** {site_data["Drainage Area"]}
- **HUC Code:** {site_data["HUC Code"]}
- """, - unsafe_allow_html=True, - ) + write_template("site_summary.md", gage.site_data) with col2: # Center column for plots + gage.raise_warnings() if show_ams: - ams = get_ams(st.session_state["gage_id"]) - data, missing_years = ams["peaks"], ams["missing_years"] - if data is not None and "peak_va" in data.columns: - if missing_years: - st.warning(f"Missing {len(missing_years)} AMS records") - st.plotly_chart(plot_ams(data, st.session_state["gage_id"]), use_container_width=True) - show_data = st.checkbox("Show AMS Data Table") - if show_data: - st.dataframe(data) - + section_ams(gage) if show_daily_stats: - data = get_flow_stats(st.session_state["gage_id"]) - if data is not None and "mean_va" in data.columns: - st.plotly_chart(plot_flow_stats(data, st.session_state["gage_id"]), use_container_width=True) - show_data = st.checkbox("Show Daily Stats Data Table") - if show_data: - st.dataframe(data) - + section_flow_stats(gage) if show_lp3: - ams = get_ams(st.session_state["gage_id"]) - if ams["peaks"] is not None: - if ams["missing_years"]: - st.warning(f"Missing {len(ams["missing_years"])} LP3 records") - st.plotly_chart(plot_lp3(ams, st.session_state["gage_id"]), use_container_width=True) - show_data = st.checkbox("Show LP3 Data Table") - if show_data: - st.dataframe(ams["lp3"]) + section_lp3(gage) if show_ams_seasonal: - ams = get_ams(st.session_state["gage_id"]) - data, missing_years = ams["peaks"], ams["missing_years"] - if data is not None and "peak_va" in data.columns: - if missing_years: - st.warning(f"Missing {len(missing_years)} AMS seasonal records") - st.plotly_chart(plot_ams_seasonal(data, st.session_state["gage_id"]), use_container_width=True) - show_data = st.checkbox("Show Ranked Seasonal Data Table") - if show_data: - st.dataframe(data) - + section_ams_seasonal(gage) if show_daily_mean: - plot_col, input_col = st.columns([8, 2]) - - with input_col: - st.write("") # blank line for more space - st.write("Daily Mean Input Dates") - start_date = 
st.text_input("Start Date (YYYY-MM-DD)", "2024-01-01") - end_date = st.text_input("End Date (YYYY-MM-DD)", "2024-12-31") - - data, missing_dates = get_daily_values(st.session_state["gage_id"], start_date, end_date) - with plot_col: - if data is not None: - if missing_dates: - st.warning(f"Missing {len(missing_dates)} daily mean records") - st.plotly_chart(plot_daily_mean(data, st.session_state["gage_id"]), use_container_width=True) - show_data = st.checkbox("Show Daily Mean Data Table") - if show_data: - st.dataframe(data) - + section_daily_mean(gage) if show_monthly_mean: - data, missing_dates = get_monthly_values(st.session_state["gage_id"]) - if data is not None and "mean_va" in data.columns: - if missing_dates: - st.warning(f"Missing {len(missing_dates)} monthly records") - st.plotly_chart(plot_monthly_mean(data, st.session_state["gage_id"]), use_container_width=True) - - show_data = st.checkbox("Show Monthly Mean Data Table") - if show_data: - st.dataframe(data) + section_monthly_mean(gage) diff --git a/hydroshift/consts.py b/hydroshift/consts.py index d239930..8eaa36e 100644 --- a/hydroshift/consts.py +++ b/hydroshift/consts.py @@ -7,6 +7,8 @@ R_SERVER_PORT = 9999 R_SERVER_URL = f"http://127.0.0.1:{R_SERVER_PORT}" +### FFA ### + ### Changepoint Analysis ### VALID_ARL0S = [ 370, @@ -31,3 +33,24 @@ 50000, ] METRICS = ["Cramer-von-Mises", "Kolmogorov-Smirnov", "Lepage", "Mann-Whitney", "Mood"] + +### External URLs ### +NWIS_URL = "https://waterdata.usgs.gov/nwis?" +PEAKFQ_URL = "https://www.usgs.gov/tools/peakfq" + +### Text snippets ### +DATA_SOURCES_STR = f"Data: [USGS NWIS]({NWIS_URL}) and [USGS PEAKFQ]({PEAKFQ_URL})" + +CP_F1_CAPTION = "**Figure 1.** Statistical changepoint analysis." +CP_T1_CAPTION = "**Table 1.** Results of the changepoint analysis, listing dates when a significant change was identified for each test statistic." +CP_F2_CAPTION = "**Figure 2.** Modified flood frequency analysis." +CP_T2_CAPTION = "**Table 2.** Modified flood quantiles." 
+ +### MISC ### +REGULATION_MAP = { + "3": "Discharge affected by Dam Failure", + "5": "Discharge affected to unknown degree by Regulation or Diversion", + "6": "Discharge affected by Regulation or Diversion", + "9": "Discharge due to Snowmelt, Hurricane, Ice-Jam or Debris Dam breakup", + "C": "All or part of the record affected by Urbanization, Mining, Agricultural changes, Channelization, or other", +} diff --git a/hydroshift/data/skewmap_4326.tif b/hydroshift/data/skewmap_4326.tif new file mode 100644 index 0000000..53b0fb7 Binary files /dev/null and b/hydroshift/data/skewmap_4326.tif differ diff --git a/hydroshift/data_retrieval.py b/hydroshift/data_retrieval.py deleted file mode 100644 index 2b41355..0000000 --- a/hydroshift/data_retrieval.py +++ /dev/null @@ -1,166 +0,0 @@ -import logging - -import numpy as np -import pandas as pd -import scipy.stats as stats -import streamlit as st -from dataretrieval import NoSitesError, nwis -from scipy.stats import genpareto - - -@st.cache_data -def get_ams(gage_id): - """Fetches Annual Maximum Series (AMS) peak flow data for a given gage.""" - try: - if gage_id == "testing": - df = fake_ams() - else: - df = nwis.get_record(service="peaks", sites=[gage_id], ssl_check=True) - except NoSitesError: - logging.warning(f"Peaks could not be found for gage id: {gage_id}") - return {"peaks": None, "lp3": None, "missing_years": None} - - df["season"] = ((df.index.month % 12 + 3) // 3).map({1: "Winter", 2: "Spring", 3: "Summer", 4: "Fall"}) - - missing_years = check_missing_dates(df, "water_year") - - df = df.dropna(subset="peak_va") - return {"peaks": df, "lp3": log_pearson_iii(df["peak_va"]), "missing_years": missing_years} - - -@st.cache_data -def get_flow_stats(gage_id): - """Fetches flow statistics for a given gage.""" - try: - df = nwis.get_stats(sites=gage_id, ssl_check=True)[0] - except IndexError: - logging.warning(f"Flow stats could not be found for gage_id: {gage_id}") - return None - - return df - - -@st.cache_data -def 
load_site_data(gage_number: str) -> dict: - """Query NWIS for site information""" - try: - resp = nwis.get_record(sites=gage_number, service="site", ssl_check=True) - - except ValueError: - raise ValueError(f"Gage {gage_number} not found") - - return { - "Site Number": resp["site_no"].iloc[0], - "Station Name": resp["station_nm"].iloc[0], - "Latitude": float(resp["dec_lat_va"].iloc[0]), - "Longitude": float(resp["dec_long_va"].iloc[0]), - "Drainage Area": resp["drain_area_va"].iloc[0], - "HUC Code": resp["huc_cd"].iloc[0], - "Elevation Datum": resp["alt_datum_cd"].iloc[0], - } - - -@st.cache_data -def log_pearson_iii(peak_flows: pd.Series, standard_return_periods: list = [2, 5, 10, 25, 50, 100, 500]): - log_flows = np.log10(peak_flows.values) - mean_log = np.mean(log_flows) - std_log = np.std(log_flows, ddof=1) - skew_log = stats.skew(log_flows) - - return { - str(rp): int(10 ** (mean_log + stats.pearson3.ppf(1 - 1 / rp, skew_log) * std_log)) - for rp in standard_return_periods - } - - -@st.cache_data -def get_daily_values(gage_id, start_date, end_date): - """Fetches mean daily flow values for a given gage.""" - try: - dv = nwis.get_dv(gage_id, start_date, end_date, ssl_check=True)[0] - except Exception: - logging.warning(f"Daily Values could not be found for gage_id: {gage_id}") - return None - - missing_dates = check_missing_dates(dv, "daily") - - return dv, missing_dates - - -@st.cache_data -def get_monthly_values(gage_id): - """Fetches mean monthly flow values for a given gage and assigns a datetime column based on the year and month.""" - try: - mv = nwis.get_stats(gage_id, statReportType="monthly", ssl_check=True)[0] - except Exception: - logging.warning(f"Monthly Values could not be found for gage_id: {gage_id}") - return None - - mv = mv.rename(columns={"year_nu": "year", "month_nu": "month"}) - - mv["date"] = pd.to_datetime(mv[["year", "month"]].assign(day=1)) - - mv = mv.sort_values("date") - missing_dates = check_missing_dates(mv, "monthly") - - return 
mv, missing_dates - - -def check_missing_dates(df, freq): - """Checks for missing dates in a DataFrame. - - Parameters - ---------- - df (pd.DataFrame): The DataFrame containing time-series data. - freq (str): Either 'daily', 'monthly', or 'water_year' to specify the type of data. - - """ - if freq == "daily": - if not isinstance(df.index, pd.DatetimeIndex): - df = df.set_index("datetime") - full_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq="D") - - elif freq == "monthly": - df["date"] = pd.to_datetime(df["date"]) - full_range = pd.date_range(start=df["date"].min(), end=df["date"].max(), freq="MS") - - elif freq == "water_year": - if not isinstance(df.index, pd.DatetimeIndex): - df = df.set_index("datetime") - - df["water_year"] = df.index.year.where(df.index.month < 10, df.index.year + 1) - - full_water_years = set(range(df["water_year"].min(), df["water_year"].max() + 1)) - existing_water_years = set(df["water_year"]) - missing_years = sorted(full_water_years - existing_water_years) - - return missing_years - else: - raise ValueError("Invalid frequency. 
Use 'daily', 'monthly', or 'water_year'.") - - missing_dates = full_range.difference(df.index if freq == "daily" else df["date"]) - - return list(missing_dates) - - -def fake_ams() -> pd.DataFrame: - """Generate a timeseries from two distributions.""" - rs = 1 - rvs = [] - segments = [ - {"length": 50, "loc": 1000, "scale": 50, "c": 0.1}, - {"length": 50, "loc": 900, "scale": 500, "c": 0.5}, - {"length": 50, "loc": 1000, "scale": 100, "c": 0.1}, - ] - for s in segments: - length = s["length"] - params = {k: v for k, v in s.items() if k != "length"} - dist = genpareto(**params) - rvs.append(dist.rvs(size=length, random_state=rs)) - rs += 1 - - rvs = np.concatenate(rvs, axis=0) - dates = pd.date_range(start="1900-01-01", periods=len(rvs), freq="YE") - water_year = dates.year - df = pd.DataFrame({"datetime": dates, "peak_va": rvs, "water_year": water_year}).set_index("datetime") - return df diff --git a/hydroshift/stats/__init__.py b/hydroshift/stats/__init__.py deleted file mode 100644 index 53b9be3..0000000 --- a/hydroshift/stats/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Statistical methods for tool.""" diff --git a/hydroshift/streamlit_app.py b/hydroshift/streamlit_app.py index d6c7d36..8647b21 100644 --- a/hydroshift/streamlit_app.py +++ b/hydroshift/streamlit_app.py @@ -1,51 +1,7 @@ import streamlit as st -from data_retrieval import load_site_data from session import init_session_state -from hydroshift._pages import changepoint, summary -from hydroshift.consts import DEFAULT_GAGE - - -def homepage(): - """Landing page for app.""" - st.set_page_config(page_title="HydroShift", layout="wide") - - left_col, right_col, _ = st.columns([2, 1.5, 0.2], gap="large") - - with left_col: - st.title("HydroShift") - st.subheader("USGS Streamflow Change Detection Tool") - st.markdown( - """ - **HydroShift** is a tool for exploring streamflow trends in USGS gage data. 
This web app - provides interactive plots of annual peak discharge, seasonality of flood peaks, and daily and monthly discharge - trends. Beyond data summary, the application includes a statistical changepoint analysis that will detect when - significant changes in the distribution of flood peaks have occured. - """ - ) - - st.write("") # blank line for more space - gage_input = st.text_input("Enter a USGS Gage Number to begin:") - col1, col2 = st.columns([1, 8]) - with col1: - submit = st.button("Submit") - with col2: - demo = st.button("Use Demo Data") - - if submit and gage_input: - st.session_state["gage_id"] = gage_input - if demo: - st.session_state["gage_id"] = DEFAULT_GAGE - - if st.session_state["gage_id"] is not None: - try: - st.session_state["site_data"] = load_site_data(st.session_state["gage_id"]) - except ValueError: - st.error(f"Data not found for gage: {st.session_state['gage_id']}") - st.rerun() - with right_col: - st.title("") - st.image("hydroshift/images/logo_base.png") +from hydroshift._pages import changepoint, homepage, summary def navigator(): diff --git a/hydroshift/templates/app_description.md b/hydroshift/templates/app_description.md new file mode 100644 index 0000000..cf6f1e0 --- /dev/null +++ b/hydroshift/templates/app_description.md @@ -0,0 +1,4 @@ +**HydroShift** is a tool for exploring streamflow trends in USGS gage data. This web app +provides interactive plots of annual peak discharge, seasonality of flood peaks, and daily and monthly discharge +trends. Beyond data summary, the application includes a statistical changepoint analysis that will detect when +significant changes in the distribution of flood peaks have occured. 
diff --git a/hydroshift/text/changepoint.py b/hydroshift/templates/changepoint_description.md similarity index 79% rename from hydroshift/text/changepoint.py rename to hydroshift/templates/changepoint_description.md index a88be2f..8668eae 100644 --- a/hydroshift/text/changepoint.py +++ b/hydroshift/templates/changepoint_description.md @@ -1,6 +1,3 @@ -"""Storage for large blocks of changepoint app text.""" - -test_description = """ A changepoint analysis was performed on the series of annual flood peaks Using the change point model of Ross (2015). For each date in the timeseries, the distribution of flood peaks leading up to that date were compared with the distribution after that date using a variety of statistical tests - **Mood**: Measures changes in the variance of the peak series. @@ -12,10 +9,8 @@ These test statistics were utilized in two distinct ways in this analysis. A static analysis was performed by evaluating the test statistic at each date of the timeseries and using a two-sample test to determine the statistical significance of the distribution differences. The P-values from this analysis are shown in the second panel of Figure 1. A streaming analysis was performed by treating the data as a stream of values and repeating the static analysis after each new value was added. If the test statistic exceeds a specified threshold at any point within the subseries, a changepoint is marked, and a new data series is initialized for all further test statistics. The resulting change points are shown as dashed red lines in the top panel of Figure 1. - - The threshold for identifying changepoints in the streaming analysis is defined using an Average Run Length (ARL0) parameter. ARL0 reflects the frequency with which a false positive would be raised on a stationary timeseries (e.g., for an ARL0 of 1,000 a false changepoint would be identified on a stationary timeseries on average every 1,000 samples.). For this analysis, an ARL0 of {} was used. 
- - A burn-in period of {} years was selected to ignore singificant change points in the first {} years of the record due to the influence of small sample sizes. + - The threshold for identifying changepoints in the streaming analysis is defined using an Average Run Length (ARL0) parameter. ARL0 reflects the frequency with which a false positive would be raised on a stationary timeseries (e.g., for an ARL0 of 1,000 a false changepoint would be identified on a stationary timeseries on average every 1,000 samples.). For this analysis, an ARL0 of {{ arl0 }} was used. + - A burn-in period of {{ burn_in }} years was selected to ignore significant change points in the first {{ burn_in }} years of the record due to the influence of small sample sizes. + +If the flood peak series contained any gaps, the data around the gap was joined as if there was no gap between the data points. For large data gaps, this assumption could lead to inaccurate results. -""" -references = """ -Gordon J. Ross (2015)., "Parametric and Nonparametric Sequential Change Detection in R: The cpm Package.", Journal of Statistical Software, 66(3), 1-20., https://www.jstatsoft.org/v66/i03/. -""" diff --git a/hydroshift/templates/changepoint_ffa.md b/hydroshift/templates/changepoint_ffa.md new file mode 100644 index 0000000..5d7fd26 --- /dev/null +++ b/hydroshift/templates/changepoint_ffa.md @@ -0,0 +1,2 @@ +Based on the changepoint analysis results, a modified flood frequency analysis was conducted using truncated periods of record. The truncated periods correspond with times when the hydrologic regime appears to be stationary across time. Results from this analysis are shown +in Figure 2 and Table 2. diff --git a/hydroshift/templates/changepoint_references.md b/hydroshift/templates/changepoint_references.md new file mode 100644 index 0000000..078a43d --- /dev/null +++ b/hydroshift/templates/changepoint_references.md @@ -0,0 +1 @@ +Gordon J. 
Ross (2015)., "Parametric and Nonparametric Sequential Change Detection in R: The cpm Package.", Journal of Statistical Software, 66(3), 1-20., [https://www.jstatsoft.org/v66/i03/](https://www.jstatsoft.org/v66/i03/). diff --git a/hydroshift/templates/changepoint_summary_1.md b/hydroshift/templates/changepoint_summary_1.md new file mode 100644 index 0000000..d556d6d --- /dev/null +++ b/hydroshift/templates/changepoint_summary_1.md @@ -0,0 +1,15 @@ +There is **{{ evidence_level }}** evidence that the annual maximum series data at USGS gage {{ gage_id }} are nonstationary in time. Four change +point detection tests were completed to assess changes in the mean, variance, and overall distribution of flood +peaks across the period of record. Significant change points were identified using a Type I error rate of 1 in +{{ arl0 }} and ignoring significant changes in the first {{ burn_in }} years of data. {{ cp_count }} statistically significant +{% if plural -%} +changepoints were +{%- else -%} +changepoint was +{%- endif %} +identified, indicating that +{% if nonstationary -%} +some form of nonstationarity (e.g., land use change, climate change, flow regulation, etc) may be influencing flow patterns at this site. +{%- else -%} +an assumption of stationary conditions is likely reasonable. +{%- endif %} diff --git a/hydroshift/templates/changepoint_summary_2.md b/hydroshift/templates/changepoint_summary_2.md new file mode 100644 index 0000000..ef4445d --- /dev/null +++ b/hydroshift/templates/changepoint_summary_2.md @@ -0,0 +1,16 @@ +The static changepoint test showed {{ evidence }} evidence of a changepoint in the timeseries. +{% if min_p < 1 -%} +A minimum p-value of {{ min_p }} was obtained at {{ p_count }} contiguous time period{{ 's' if plural else '' }}. +{%- else -%} +There were no time-periods where a p-value less than 0.05 was observed in all tests. 
+{%- endif %} +The p-value reflects the probability that the distribution of +flood peaks before that date has the **same** distribution as the flood peaks after the date. + +{% if len_cp == 1 -%} +The streaming analysis identified one statistically significant changepoint. This changepoint was identified by {{ test_count }} distinct tests: {{ test_list }}. +{%- else -%} +The streaming analysis identified {{ len_cp_str }} statistically significant changepoints. These changepoints broadly fell +into {{ grp_count }} window{{ 's' if plural_2 else '' }} where tests identified changepoints not more than 10 years apart. For a full summary of which +tests identified changes at which dates, see Table 1. +{%- endif %} diff --git a/hydroshift/templates/data_sources_side_bar.html b/hydroshift/templates/data_sources_side_bar.html new file mode 100644 index 0000000..dcec97b --- /dev/null +++ b/hydroshift/templates/data_sources_side_bar.html @@ -0,0 +1,10 @@ +
+ Data: USGS NWIS and + USGS PEAKFQ +
diff --git a/hydroshift/templates/footer.html b/hydroshift/templates/footer.html new file mode 100644 index 0000000..8bac78e --- /dev/null +++ b/hydroshift/templates/footer.html @@ -0,0 +1,33 @@ + + + diff --git a/hydroshift/templates/site_summary.md b/hydroshift/templates/site_summary.md new file mode 100644 index 0000000..1a23880 --- /dev/null +++ b/hydroshift/templates/site_summary.md @@ -0,0 +1,6 @@ +**Site Number:** {{ site_no }}
+**Station Name:** {{ station_nm }}
+**Latitude:** {{ dec_lat_va }}
+**Longitude:** {{ dec_long_va }}
+**Drainage Area (sq.mi.):** {{ drain_area_va }}
+**HUC Code:** {{ huc_cd }}
diff --git a/hydroshift/text/__init__.py b/hydroshift/text/__init__.py deleted file mode 100644 index 143f7ac..0000000 --- a/hydroshift/text/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Storage for large blocks of app text.""" diff --git a/hydroshift/stats/tests.py b/hydroshift/utils/changepoint.py similarity index 100% rename from hydroshift/stats/tests.py rename to hydroshift/utils/changepoint.py diff --git a/hydroshift/utils/common.py b/hydroshift/utils/common.py new file mode 100644 index 0000000..ca78ebd --- /dev/null +++ b/hydroshift/utils/common.py @@ -0,0 +1,53 @@ +from itertools import groupby +from typing import List + +import pandas as pd + +from hydroshift.consts import REGULATION_MAP + + +def num_2_word(number: int) -> str: + """Convert numbers less than 10 to words.""" + if number > 9: + return number + else: + d = { + 0: "no", + 1: "one", + 2: "two", + 3: "three", + 4: "four", + 5: "five", + 6: "six", + 7: "seven", + 8: "eight", + 9: "nine", + } + return d[number] + + +def group_consecutive_years(years: List[int]) -> List[str]: + """Group consecutive years together and returns a list of formatted year ranges.""" + sorted_years = sorted(years) + grouped_years = [] + + for _, group in groupby(enumerate(sorted_years), lambda ix: ix[0] - ix[1]): + g = [x[1] for x in group] + grouped_years.append(f"{g[0]}" if len(g) == 1 else f"{g[0]}-{g[-1]}") + + return grouped_years + + +def classify_regulation(code_string): + """Return True if any code in the string is considered regulated.""" + if pd.isna(code_string): + return False + for code in str(code_string).split(","): + code = code.strip() + try: + code = str(int(float(code))) # normalize numeric string like '6.0' to '6' + except ValueError: + pass + if code in REGULATION_MAP: + return True + return False diff --git a/hydroshift/utils/data_retrieval.py b/hydroshift/utils/data_retrieval.py new file mode 100644 index 0000000..d029899 --- /dev/null +++ b/hydroshift/utils/data_retrieval.py @@ -0,0 +1,342 @@ 
+import logging +from typing import List + +import numpy as np +import pandas as pd +import rasterio +import requests +import streamlit as st +from dataretrieval import NoSitesError, nwis +from scipy.stats import genpareto + +from hydroshift.consts import REGULATION_MAP +from hydroshift.utils.common import group_consecutive_years + + +class Gage: + """A USGS Gage.""" + + def __init__(self, gage_id: str): + """Construct class.""" + self.gage_id = gage_id + self.site_data = load_site_data(gage_id) + + @property + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def latitude(self) -> float: + """Latitude of gage.""" + return self.site_data.get("dec_lat_va") + + @property + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def longitude(self) -> float: + """Longitude of gage.""" + return self.site_data.get("dec_long_va") + + @property + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def elevation(self) -> float: + """Elevation of gage.""" + return self.site_data.get("alt_va") + + @property + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def mean_basin_elevation(self) -> float: + """Average elevation of gage watershed.""" + row = [ + r for r in self.streamstats["characteristics"] if r["variableTypeID"] == 6 + ] # Get ELEV param + return row[0]["value"] + + @property + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def streamstats(self) -> pd.DataFrame: + """Load AMS for this site.""" + r = requests.get( + f"https://streamstats.usgs.gov/gagestatsservices/stations/{self.gage_id}" + ) + return r.json() + + @property + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def ams(self) -> pd.DataFrame: + """Load AMS for this site.""" + return 
get_ams(self.gage_id) + + @property + def ams_vals(self) -> np.ndarray: + """Convenient ams column selection.""" + return self.ams["peak_va"].values + + @property + def missing_dates_ams(self) -> list: + """Get missing dates for AMS.""" + return check_missing_dates(self.ams, "water_year") + + @property + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def flow_stats(self) -> pd.DataFrame: + """Load flow statistics for this site.""" + return get_flow_stats(self.gage_id) + + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def get_daily_values(self, start_date: str, end_date: str) -> pd.DataFrame: + """Load daily mean discharge for this site.""" + return get_daily_values(self.gage_id, start_date, end_date) + + def missing_dates_daily_values(self, start_date: str, end_date: str) -> list: + """Get missing dates for mean daily value series.""" + return check_missing_dates(self.get_daily_values(start_date, end_date), "daily") + + @st.cache_data( + hash_funcs={"hydroshift.utils.data_retrieval.Gage": lambda x: hash(x.gage_id)} + ) + def get_monthly_values(self) -> pd.DataFrame: + """Load monthly mean discharge for this site.""" + return get_monthly_values(self.gage_id) + + @property + def missing_dates_monthly_values(self) -> list: + """Get missing dates for mean monthly value series.""" + return check_missing_dates(self.get_monthly_values(), "monthly") + + def get_regulation_summary(self, major_codes=["3", "9"]) -> List[str]: + """Run regulation summary for gage.""" + df = self.ams + df["water_year"] = df.index.year.where(df.index.month < 10, df.index.year + 1) + + regulation_years = {} + + for index, row in df.iterrows(): + if pd.notna(row.get("peak_cd")): + codes = str(row["peak_cd"]).split(",") + for code in codes: + code_str = code.strip() + try: + code_key = str(int(float(code_str))) # Normalize numeric codes + except ValueError: + code_key = code_str + + if 
code_key in REGULATION_MAP: + regulation_years.setdefault(code_key, set()).add( + row["water_year"] + ) + + results = {"major": [], "minor": []} + for code, years in regulation_years.items(): + grouped_year_ranges = group_consecutive_years(sorted(years)) + formatted_ranges = ", ".join(grouped_year_ranges) + if code in major_codes: + results["major"].append( + f"{REGULATION_MAP[code]} for water years {formatted_ranges}" + ) + else: + results["minor"].append( + f"{REGULATION_MAP[code]} for water years {formatted_ranges}" + ) + + return results + + def raise_warnings(self): + """Create any high level data warnings.""" + regulation_results = self.get_regulation_summary() + if regulation_results["minor"]: + for result in regulation_results["minor"]: + st.warning(result) + if regulation_results["major"]: + for result in regulation_results["major"]: + st.error(result) + + @property + def has_regional_skew(self) -> bool: + """Check if gage has regional skew available.""" + return self.regional_skew is not None + + @property + def regional_skew(self) -> float: + """The skew value from the PeakFQ skew map, if one exists.""" + raster = get_skew_raster() + try: + val = [i for i in raster.sample([(self.longitude, self.latitude)])][0][0] + if val in raster.nodatavals: + return None + except (KeyError, IndexError): + return None + if val == 9999: # California Eq. 
from USGS SIR 2010-5260 NL-ELEV eq + val = (0 - 0.62) + 1.3 * ( + 1 - np.exp(0 - ((self.mean_basin_elevation) / 6500) ** 2) + ) + return val + + +@st.cache_data +def get_ams(gage_id): + """Fetches Annual Maximum Series (AMS) peak flow data for a given gage.""" + try: + if gage_id == "testing": + df = fake_ams() + else: + df = nwis.get_record(service="peaks", sites=[gage_id], ssl_check=True) + except NoSitesError: + logging.warning(f"Peaks could not be found for gage id: {gage_id}") + return {"peaks": None, "lp3": None, "missing_years": None} + + df["season"] = ((df.index.month % 12 + 3) // 3).map( + {1: "Winter", 2: "Spring", 3: "Summer", 4: "Fall"} + ) # TODO: should add labels like Winter(JFM), Spring(AMJ), etc + + df = df.dropna(subset="peak_va") + return df + + +@st.cache_data +def get_flow_stats(gage_id): + """Fetches flow statistics for a given gage.""" + try: + df = nwis.get_stats(sites=gage_id, parameterCd="00060", ssl_check=True)[0] + except IndexError: + logging.warning(f"Flow stats could not be found for gage_id: {gage_id}") + return None + + return df + + +@st.cache_data +def load_site_data(gage_number: str) -> dict: + """Query NWIS for site information""" + try: + resp = nwis.get_record(sites=gage_number, service="site", ssl_check=True) + + except ValueError: + raise ValueError(f"Gage {gage_number} not found") + + return { + "site_no": resp["site_no"].iloc[0], + "station_nm": resp["station_nm"].iloc[0], + "dec_lat_va": float(resp["dec_lat_va"].iloc[0]), + "dec_long_va": float(resp["dec_long_va"].iloc[0]), + "drain_area_va": resp["drain_area_va"].iloc[0], + "huc_cd": resp["huc_cd"].iloc[0], + "alt_va": resp["alt_va"].iloc[0], + "alt_datum_cd": resp["alt_datum_cd"].iloc[0], + } + + +@st.cache_data +def get_daily_values(gage_id, start_date, end_date): + """Fetches mean daily flow values for a given gage.""" + try: + dv = nwis.get_dv(gage_id, start_date, end_date, ssl_check=True)[0] + except Exception: + logging.warning(f"Daily Values could not be found 
for gage_id: {gage_id}") + return None + + return dv + + +@st.cache_data +def get_monthly_values(gage_id): + """Fetches mean monthly flow values for a given gage and assigns a datetime column based on the year and month.""" + try: + mv = nwis.get_stats(gage_id, statReportType="monthly", ssl_check=True)[0] + except Exception: + logging.warning(f"Monthly Values could not be found for gage_id: {gage_id}") + return None + + mv = mv.rename(columns={"year_nu": "year", "month_nu": "month"}) + + mv["date"] = pd.to_datetime(mv[["year", "month"]].assign(day=1)) + + mv = mv.sort_values("date") + + return mv + + +def check_missing_dates(df, freq): + """Checks for missing dates in a DataFrame. + + Parameters + ---------- + df (pd.DataFrame): The DataFrame containing time-series data. + freq (str): Either 'daily', 'monthly', or 'water_year' to specify the type of data. + + """ + if freq == "daily": + if not isinstance(df.index, pd.DatetimeIndex): + df = df.set_index("datetime") + full_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq="D") + + elif freq == "monthly": + df["date"] = pd.to_datetime(df["date"]) + full_range = pd.date_range( + start=df["date"].min(), end=df["date"].max(), freq="MS" + ) + + elif freq == "water_year": + if not isinstance(df.index, pd.DatetimeIndex): + df = df.set_index("datetime") + + df["water_year"] = df.index.year.where(df.index.month < 10, df.index.year + 1) + + full_water_years = set( + range(df["water_year"].min(), df["water_year"].max() + 1) + ) + existing_water_years = set(df["water_year"]) + missing_years = sorted(full_water_years - existing_water_years) + + return missing_years + else: + raise ValueError("Invalid frequency. 
Use 'daily', 'monthly', or 'water_year'.") + + missing_dates = full_range.difference(df.index if freq == "daily" else df["date"]) + + return list(missing_dates) + + +def fake_ams() -> pd.DataFrame: + """Generate a timeseries from two distributions.""" + rs = 1 + rvs = [] + segments = [ + {"length": 50, "loc": 1000, "scale": 50, "c": 0.1}, + {"length": 50, "loc": 900, "scale": 500, "c": 0.5}, + {"length": 50, "loc": 1000, "scale": 100, "c": 0.1}, + ] + for s in segments: + length = s["length"] + params = {k: v for k, v in s.items() if k != "length"} + dist = genpareto(**params) + rvs.append(dist.rvs(size=length, random_state=rs)) + rs += 1 + + rvs = np.concatenate(rvs, axis=0) + dates = pd.date_range(start="1900-01-01", periods=len(rvs), freq="YE") + water_year = dates.year + df = pd.DataFrame( + {"datetime": dates, "peak_va": rvs, "water_year": water_year} + ).set_index("datetime") + return df + + +@st.cache_resource +def get_skew_raster(): + """Load the skew raster into memory.""" + return rasterio.open( + __file__.replace("utils/data_retrieval.py", "data/skewmap_4326.tif") + ) diff --git a/hydroshift/utils/ffa.py b/hydroshift/utils/ffa.py new file mode 100644 index 0000000..381b568 --- /dev/null +++ b/hydroshift/utils/ffa.py @@ -0,0 +1,134 @@ +from dataclasses import dataclass, field +from math import comb +from typing import List + +import numpy as np +import pandas as pd +from scipy.stats import pearson3, rv_continuous + +from hydroshift.utils.data_retrieval import Gage + + +@dataclass +class LP3Analysis: + """A log-pearson type 3 analysis.""" + + gage_id: str + peaks: list + use_map_skew: bool = False + est_method: str = "MLE" + label: str = "" + return_periods: List[str] = field( + default_factory=lambda: [1.1, 2, 5, 10, 25, 50, 100, 500] + ) + + # TODO: Add california equation. Likely best to subclass this. 
+ + def __post_init__(self): + """Customize init.""" + self.peaks = np.sort(self.peaks) + + @property + def log_peaks(self) -> np.ndarray: + """Log 10 of peaks.""" + return np.log10(self.peaks) + + @property + def parameters(self) -> tuple[float]: + """Sample parameters for LP3 distribution.""" + if self.est_method == "MOM": + skew_log, mean_log, std_log = pearson3.fit(self.log_peaks, method="MM") + elif self.est_method == "MLE": + skew_log, mean_log, std_log = pearson3.fit(self.log_peaks, method="MLE") + elif self.est_method == "LMOM": + mean_log, std_log, l3 = l_moments(self.log_peaks) + skew_log = l3 / (std_log * 0.7797) # pseudo-stdev + if self.use_map_skew: + skew_log = self.weighted_skew + return mean_log, std_log, skew_log + + @property + def station_skew(self) -> tuple[float]: + """Skew of peaks.""" + return pearson3.fit(self.log_peaks, method="MM")[0] + + @property + def distribution(self) -> rv_continuous: + """The fitted LP3 distribution.""" + params = self.parameters + return pearson3(params[2], params[0], params[1]) + + @property + def plotting_positions(self) -> tuple[np.ndarray]: + """Empirical flood frequency curve.""" + aep = np.arange(1, len(self.peaks) + 1)[::-1] / (len(self.peaks) + 1) + return (aep, self.peaks) + + @property + def ffa_quantiles(self) -> tuple[np.ndarray]: + """Calculate some recurrence intervals from fitted distribution.""" + ris = np.array(self.return_periods) + aeps = 1 / ris + qs = np.power(10, self.distribution.ppf(1 - aeps)).astype(int) + return (aeps, qs) + + @property + def quantile_df(self): + """Put quantiles into a dataframe.""" + _, qs = self.ffa_quantiles + ris = [str(i) for i in self.return_periods] + return pd.DataFrame({"Recurrence Interval (years)": ris, "Discharge (cfs)": qs}) + + @property + def map_skew(self): + """The skew value from the PeakFQ skew map, if one exists.""" + gage = Gage(self.gage_id) + return gage.regional_skew + + @property + def mse_station_skew(self) -> float: + """Weighted station skew 
 using USGS B17B method."""
+        abs_g = abs(self.station_skew)
+        if abs_g <= 0.9:
+            a = -0.33 + (0.08 * abs_g)
+        else:
+            a = -0.52 + (0.3 * abs_g)
+        if abs_g <= 1.5:
+            b = 0.94 - (0.26 * abs_g)
+        else:
+            b = 0.55
+        return 10 ** (a - (b * np.log10(len(self.peaks) / 10)))
+
+    @property
+    def weighted_skew(self) -> float:
+        """Weighted station skew using USGS B17B method."""
+        g_s = self.station_skew  # Station Skew
+        g_g = self.map_skew  # Generalized skew
+        mse_s = self.mse_station_skew  # MSE of station skew
+        mse_g = 0.302  # From peakfq user manual page 7. TODO: Update for Cali equation
+
+        # From Handbook of hydrology pg 18.44
+        g_weighted = ((g_s / mse_s) + (g_g / mse_g)) / ((1 / mse_s) + (1 / mse_g))
+        return g_weighted
+
+
+def l_moments(series):
+    """Calculate first three L moments."""
+    ### From Stedinger 1993 around page 7 (section 18.6.1) ###
+    series = np.sort(series)[::-1]
+    n = len(series)
+    b_values = list()
+    # This method could be streamlined, but as-is, it's easily debugged
+    for r in range(4):
+        running_sum = list()
+        for j in range(n - r):
+            cur = (comb(n - (j + 1), r) * series[j]) / comb(n - 1, r)
+            running_sum.append(cur)
+        running_sum = sum(running_sum)
+        b = running_sum / n
+        b_values.append(b)
+
+    l1 = b_values[0]
+    l2 = (2 * b_values[1]) - b_values[0]
+    l3 = (6 * b_values[2]) - (6 * b_values[1]) + (1 * b_values[0])
+    return l1, l2, l3
diff --git a/hydroshift/utils/jinja.py b/hydroshift/utils/jinja.py
new file mode 100644
index 0000000..f087be9
--- /dev/null
+++ b/hydroshift/utils/jinja.py
@@ -0,0 +1,33 @@
+from pathlib import Path
+
+import streamlit as st
+from jinja2 import Environment, FileSystemLoader, meta, select_autoescape
+
+from hydroshift import consts
+
+template_dir = Path(__file__).resolve().parent.parent / "templates"
+
+env = Environment(loader=FileSystemLoader(template_dir), autoescape=select_autoescape(["html", "xml"]))
+
+
+def check_for_consts(template_name: str, context: dict) -> dict:
+    """Check if any of the template vars are
in consts.py.""" + vars = meta.find_undeclared_variables(env.parse(env.loader.get_source(env, template_name)[0])) + for var in vars: + if hasattr(consts, var): + context[var] = getattr(consts, var) + return context + + +@st.cache_data +def render_template(template_name: str, context: dict = {}) -> str: + """Load a template from the environment and format it.""" + context = check_for_consts(template_name, context) + template = env.get_template(template_name) + return template.render(context) + + +def write_template(template_name: str, context: dict = {}): + """Format a template and then write it with st.""" + st.markdown(render_template(template_name, context), unsafe_allow_html=True) + st.empty() diff --git a/hydroshift/plots.py b/hydroshift/utils/plots.py similarity index 73% rename from hydroshift/plots.py rename to hydroshift/utils/plots.py index ac01f2c..2d84cfd 100644 --- a/hydroshift/plots.py +++ b/hydroshift/utils/plots.py @@ -1,24 +1,45 @@ -import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go import streamlit as st +from plotly.graph_objects import Figure from plotly.subplots import make_subplots +from scipy.stats import norm + +from hydroshift.utils.common import classify_regulation +from hydroshift.utils.ffa import LP3Analysis def plot_ams(ams_df, gage_id, cps: dict = {}): - """Plots AMS (Annual Peak Flow) using Plotly with only markers (no line).""" + """Plot AMS (Annual Peak Flow) with markers colored by regulation status.""" fig = go.Figure() + # Classify each point as regulated or not + ams_df["regulated"] = ams_df["peak_cd"].apply(classify_regulation) + + # Plot non-regulated points fig.add_trace( go.Scatter( - x=ams_df.index, - y=ams_df["peak_va"], + x=ams_df[~ams_df["regulated"]].index, + y=ams_df[~ams_df["regulated"]]["peak_va"], mode="markers", marker=dict(size=6, color="blue"), - name="Peak Flow", + name="Non-Regulated", + ) + ) + + # Plot regulated points + fig.add_trace( + go.Scatter( + 
x=ams_df[ams_df["regulated"]].index, + y=ams_df[ams_df["regulated"]]["peak_va"], + mode="markers", + marker=dict(size=6, color="red"), + name="Regulated", ) ) + + # Plot changepoints if provided if cps: for ind, cp in enumerate(cps): x = ams_df.index[cp] @@ -27,22 +48,22 @@ def plot_ams(ams_df, gage_id, cps: dict = {}): x=[x, x], y=[ams_df["peak_va"].min(), ams_df["peak_va"].max()], mode="lines", - line=dict(color="red", width=1, dash="dash"), + line=dict(color="black", width=1, dash="dash"), hovertext=[cps[cp], cps[cp]], hoverinfo="text", showlegend=False, ) ) - # Label line + # Legend label for changepoint fig.add_trace( go.Scatter( x=[0, 0], y=[0, 0], mode="lines", - line=dict(color="red", width=1, dash="dash"), # Dashed style - hoverinfo="skip", # Don't show hover on this trace - showlegend=True, # Hide the legend entry for this trace - name="Statistically\nSignificant\nChangepoint", + line=dict(color="black", width=1, dash="dash"), + hoverinfo="skip", + showlegend=True, + name="Statistically Significant Changepoint", ) ) @@ -64,16 +85,25 @@ def plot_ams(ams_df, gage_id, cps: dict = {}): def plot_flow_stats(stats_df, gage_id): - """Plots Flow Statistics using Plotly with correctly labeled legend entries.""" - # Ensure the data is sorted properly + """Plot Flow Statistics using Plotly with month-abbreviation x-axis labels.""" + # Ensure data is sorted + # Create a datetime column + stats_df["date"] = pd.to_datetime( + { + "year": 2000, # dummy leap year to support Feb 29 + "month": stats_df["month_nu"], + "day": stats_df["day_nu"], + }, + errors="coerce", + ) stats_df = stats_df.sort_values(by=["month_nu", "day_nu"]) # Approximate day of year stats_df["day_of_year"] = stats_df["month_nu"] * 30 + stats_df["day_nu"] fig = go.Figure() - # Add percentile shaded regions with correct labels + # Percentile bands fig.add_trace( go.Scatter( - x=stats_df["day_of_year"], + x=stats_df["date"], y=stats_df["p95_va"], mode="lines", line=dict(color="lightblue"), @@ -82,17 
+112,18 @@ def plot_flow_stats(stats_df, gage_id): ) fig.add_trace( go.Scatter( - x=stats_df["day_of_year"], + x=stats_df["date"], y=stats_df["p05_va"], mode="lines", line=dict(color="lightblue"), fill="tonexty", showlegend=False, + name="5th-95th Percentile", ) ) fig.add_trace( go.Scatter( - x=stats_df["day_of_year"], + x=stats_df["date"], y=stats_df["p90_va"], mode="lines", line=dict(color="blue"), @@ -101,17 +132,18 @@ def plot_flow_stats(stats_df, gage_id): ) fig.add_trace( go.Scatter( - x=stats_df["day_of_year"], + x=stats_df["date"], y=stats_df["p10_va"], mode="lines", line=dict(color="blue"), fill="tonexty", showlegend=False, + name="10th-90th Percentile", ) ) fig.add_trace( go.Scatter( - x=stats_df["day_of_year"], + x=stats_df["date"], y=stats_df["p75_va"], mode="lines", line=dict(color="darkblue"), @@ -120,50 +152,56 @@ def plot_flow_stats(stats_df, gage_id): ) fig.add_trace( go.Scatter( - x=stats_df["day_of_year"], + x=stats_df["date"], y=stats_df["p25_va"], mode="lines", line=dict(color="darkblue"), fill="tonexty", showlegend=False, + name="25th-75th Percentile", ) ) - # Add mean flow + + # Mean flow line fig.add_trace( go.Scatter( - x=stats_df["day_of_year"], + x=stats_df["date"], y=stats_df["mean_va"], mode="lines+markers", line=dict(color="black"), name="Mean Flow", ) ) + + # Format layout with month abbreviations fig.update_layout( title=f"{gage_id} | Daily Flow Statistics", - xaxis_title="Day of Year", + xaxis_title="Month", yaxis_title="Flow (cfs)", legend_title="Flow Statistics", + xaxis=dict( + tickformat="%b", # abbreviated month name + dtick="M1", # monthly ticks + ), ) return fig -def plot_lp3(data: dict | list, gage_id: str, multi_series: bool = False): +def plot_lp3(data: LP3Analysis | list[LP3Analysis]): """Creates a Plotly chart for Log-Pearson Type III return period vs. 
peak flow.""" # Convert dict to lists for plotting - if not multi_series: - data = {"": data} + if isinstance(data, LP3Analysis): + data = [data] fig = go.Figure() for i in data: - name = i + " - Peaks" - peaks = data[i]["peaks"]["peak_va"].values - peaks.sort() - aep = np.arange(1, len(peaks) + 1)[::-1] / (len(peaks) + 1) - ri = 1 / aep + name = i.label + " - Peaks" + aep, peaks = i.plotting_positions + z = norm.ppf(1 - aep) fig.add_trace( go.Scatter( - x=ri, + x=z, y=peaks, mode="markers", marker=dict(size=8, symbol="circle-open"), @@ -171,14 +209,13 @@ def plot_lp3(data: dict | list, gage_id: str, multi_series: bool = False): ) ) - name = i + " - Log-Pearson III Fit" - lp3 = data[i]["lp3"] - return_periods = list(map(int, lp3.keys())) - peak_flows = list(lp3.values()) + name = i.label + " - Log-Pearson III Fit" + aep2, peaks2 = i.ffa_quantiles + z2 = norm.ppf(1 - aep2) fig.add_trace( go.Scatter( - x=return_periods, - y=peak_flows, + x=z2, + y=peaks2, mode="markers+lines", marker=dict(size=8), line=dict(dash="solid"), @@ -186,15 +223,22 @@ def plot_lp3(data: dict | list, gage_id: str, multi_series: bool = False): ) ) + # Formatting + return_periods = [ + int(i) if i.is_integer() else round(i, 1) for i in i.return_periods + ] + if i.use_map_skew: + skew_txt = "" + else: + skew_txt = " (No Regional Skew)" fig.update_layout( - title=f"{gage_id} | Log-Pearson Type III Estimates (No Regional Skew)", + title=f"{i.gage_id} | Log-Pearson Type III Estimates{skew_txt}", xaxis=dict( title="Return Period (years)", - type="log", - tickvals=return_periods, - ticktext=[str(rp) for rp in return_periods], + tickvals=z2, + ticktext=return_periods, ), - yaxis=dict(title="Peak Flow (cfs)"), + yaxis=dict(title="Peak Flow (cfs)", type="log"), showlegend=True, template="plotly_white", ) @@ -215,7 +259,12 @@ def plot_ams_seasonal(df, gage_id): color="season", title=f"Gage ID: {gage_id} | Seasonal AMS Ranked from Low to High", labels={"rank": "Rank", "peak_va": "Flow (cfs)"}, - 
color_discrete_map={"Winter": "blue", "Spring": "green", "Summer": "orange", "Fall": "brown"}, + color_discrete_map={ + "Winter": "blue", + "Spring": "green", + "Summer": "orange", + "Fall": "brown", + }, ) fig.update_layout( @@ -233,7 +282,13 @@ def plot_daily_mean(dv_df, gage_id): fig = go.Figure() fig.add_trace( - go.Scatter(x=dv_df.index, y=dv_df["00060_Mean"], mode="lines", line=dict(color="blue"), name="Daily Mean Flow") + go.Scatter( + x=dv_df.index, + y=dv_df["00060_Mean"], + mode="lines", + line=dict(color="blue"), + name="Daily Mean Flow", + ) ) fig.update_layout( @@ -316,7 +371,7 @@ def plot_cpm_heatmap(pval_df: pd.DataFrame): @st.cache_data -def combo_cpm(ams_df: pd.DataFrame, pval_df: pd.DataFrame, cps: dict = {}): +def combo_cpm(ams_df: pd.DataFrame, pval_df: pd.DataFrame, cps: dict = {}) -> Figure: """Plot a change point model with peak flows and statistical analysis.""" # Create subplots fig = make_subplots( @@ -329,7 +384,11 @@ def combo_cpm(ams_df: pd.DataFrame, pval_df: pd.DataFrame, cps: dict = {}): # Scatter plot for peak flow data fig.add_trace( go.Scatter( - x=ams_df.index, y=ams_df["peak_va"], mode="markers", marker=dict(size=6, color="blue"), name="Peak Flow" + x=ams_df.index, + y=ams_df["peak_va"], + mode="markers", + marker=dict(size=6, color="blue"), + name="Peak Flow", ), row=1, col=1, @@ -400,7 +459,7 @@ def combo_cpm(ams_df: pd.DataFrame, pval_df: pd.DataFrame, cps: dict = {}): ), legend_tracegroupgap=10, xaxis2=dict(title="Date"), - yaxis=dict(title="Peak Flow"), + yaxis=dict(title="Peak Flow (cfs)"), yaxis2=dict(title="Statistical Test"), legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0), height=600, diff --git a/pyproject.toml b/pyproject.toml index 60c1f4d..38df551 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ requires-python = ">=3.12" dependencies = [ "dataretrieval==1.0.11", "folium==0.19.4", + "jinja2>=3.1.6", "kaleido==0.2.0", "leafmap>=0.43.1", "matplotlib>=3.10.1", @@ -15,6 +16,7 
@@ dependencies = [ "psutil>=7.0.0", "python-docx>=1.1.2", "radian>=0.6.13", + "rasterio>=1.4.3", "scipy==1.15.2", "streamlit==1.42.2", "streamlit-folium==0.24.0", diff --git a/tests/stat_tests/changepoint_test.py b/tests/stat_tests/changepoint_test.py index 3b604ff..2c43f77 100644 --- a/tests/stat_tests/changepoint_test.py +++ b/tests/stat_tests/changepoint_test.py @@ -5,7 +5,7 @@ from scipy.ndimage import gaussian_filter1d from scipy.stats import genpareto -from hydroshift.stats.tests import ( +from hydroshift.utils.changepoint import ( cpm_detect_change_point_batch, cpm_process_stream, get_batch_threshold, diff --git a/uv.lock b/uv.lock index d78bcaf..db553f3 100644 --- a/uv.lock +++ b/uv.lock @@ -2,6 +2,15 @@ version = 1 revision = 1 requires-python = ">=3.12" +[[package]] +name = "affine" +version = "2.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/69/98/d2f0bb06385069e799fc7d2870d9e078cfa0fa396dc8a2b81227d0da08b9/affine-2.4.0.tar.gz", hash = "sha256:a24d818d6a836c131976d22f8c27b8d3ca32d0af64c1d8d29deb7bafa4da1eea", size = 17132 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0b/f7/85273299ab57117850cc0a936c64151171fac4da49bc6fba0dad984a7c5f/affine-2.4.0-py3-none-any.whl", hash = "sha256:8a3df80e2b2378aef598a83c1392efd47967afec4242021a0b06b4c7cbc61a92", size = 15662 }, +] + [[package]] name = "altair" version = "5.5.0" @@ -197,6 +206,30 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, ] +[[package]] +name = "click-plugins" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/5f/1d/45434f64ed749540af821fd7e42b8e4d23ac04b1eda7c26613288d6cd8a8/click-plugins-1.1.1.tar.gz", hash = "sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b", size = 8164 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e9/da/824b92d9942f4e472702488857914bdd50f73021efea15b4cad9aca8ecef/click_plugins-1.1.1-py2.py3-none-any.whl", hash = "sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8", size = 7497 }, +] + +[[package]] +name = "cligj" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "click" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ea/0d/837dbd5d8430fd0f01ed72c4cfb2f548180f4c68c635df84ce87956cff32/cligj-0.7.2.tar.gz", hash = "sha256:a4bc13d623356b373c2c27c53dbd9c68cae5d526270bfa71f6c6fa69669c6b27", size = 9803 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/73/86/43fa9f15c5b9fb6e82620428827cd3c284aa933431405d1bcf5231ae3d3e/cligj-0.7.2-py3-none-any.whl", hash = "sha256:c1ca117dbce1fe20a5809dc96f01e1c2840f6dcc939b3ddbb1111bf330ba82df", size = 7069 }, +] + [[package]] name = "colorama" version = "0.4.6" @@ -437,6 +470,7 @@ source = { virtual = "." 
} dependencies = [ { name = "dataretrieval" }, { name = "folium" }, + { name = "jinja2" }, { name = "kaleido" }, { name = "leafmap" }, { name = "matplotlib" }, @@ -445,6 +479,7 @@ dependencies = [ { name = "psutil" }, { name = "python-docx" }, { name = "radian" }, + { name = "rasterio" }, { name = "scipy" }, { name = "streamlit" }, { name = "streamlit-folium" }, @@ -454,6 +489,7 @@ dependencies = [ requires-dist = [ { name = "dataretrieval", specifier = "==1.0.11" }, { name = "folium", specifier = "==0.19.4" }, + { name = "jinja2", specifier = ">=3.1.6" }, { name = "kaleido", specifier = "==0.2.0" }, { name = "leafmap", specifier = ">=0.43.1" }, { name = "matplotlib", specifier = ">=3.10.1" }, @@ -462,6 +498,7 @@ requires-dist = [ { name = "psutil", specifier = ">=7.0.0" }, { name = "python-docx", specifier = ">=1.1.2" }, { name = "radian", specifier = ">=0.6.13" }, + { name = "rasterio", specifier = ">=1.4.3" }, { name = "scipy", specifier = "==1.15.2" }, { name = "streamlit", specifier = "==1.42.2" }, { name = "streamlit-folium", specifier = "==0.24.0" }, @@ -1335,6 +1372,32 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/e1/3b/c5ff0d431ea2bf87646600ba75a3786836d85edd9e8da0f468a8d07ab8e4/radian-0.6.13.tar.gz", hash = "sha256:197da7c44c7f21ec6926ec2bb3223c968e833099725cf1a76d2aa80c0b0235b7", size = 53891 } +[[package]] +name = "rasterio" +version = "1.4.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "affine" }, + { name = "attrs" }, + { name = "certifi" }, + { name = "click" }, + { name = "click-plugins" }, + { name = "cligj" }, + { name = "numpy" }, + { name = "pyparsing" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/de/19/ab4326e419b543da623ce4191f68e3f36a4d9adc64f3df5c78f044d8d9ca/rasterio-1.4.3.tar.gz", hash = "sha256:201f05dbc7c4739dacb2c78a1cf4e09c0b7265b0a4d16ccbd1753ce4f2af350a", size = 442990 } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/5a/f2/b7417292ceace70d815760f7e41fe5b0244ebff78ede11b1ffa9ca01c370/rasterio-1.4.3-cp312-cp312-macosx_10_15_x86_64.whl", hash = "sha256:e703e4b2c74c678786d5d110a3f30e26f3acfd65f09ccf35f69683a532f7a772", size = 21514543 }, + { url = "https://files.pythonhosted.org/packages/b2/ea/e21010457847b26bb4aea3983e9b44afbcefef07defc5e9a3285a8fe2f0c/rasterio-1.4.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:38a126f8dbf405cd3450b5bd10c6cc493a2e1be4cf83442d26f5e4f412372d36", size = 18735924 }, + { url = "https://files.pythonhosted.org/packages/67/72/331727423b28fffdfd8bf18bdc55c18d374c0fefd2dde390cd833f8f4477/rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8e90c2c300294265c16becc9822337ded0f01fb8664500b4d77890d633d8cd0e", size = 22251721 }, + { url = "https://files.pythonhosted.org/packages/be/cc/453816b489af94b9a243eda889865973d518989ba6923b2381f6d6722b43/rasterio-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:a962ad4c29feaf38b1d7a94389313127de3646a5b9b734fbf9a04e16051a27ff", size = 25430154 }, + { url = "https://files.pythonhosted.org/packages/2e/e0/718c06b825d1f62077913e5bff1e70b71ac673718b135d55a0256d88d4ba/rasterio-1.4.3-cp313-cp313-macosx_10_15_x86_64.whl", hash = "sha256:5d4fcb635379b3d7b2f5e944c153849e3d27e93f35ad73ad4d3f0b8a580f0c8e", size = 21532284 }, + { url = "https://files.pythonhosted.org/packages/bb/a8/3b6b11923300d6835453d1157fabb518338067a67366c5c52e9df9a2314f/rasterio-1.4.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:98a9c89eade8c779e8ac1e525269faaa18c6b9818fc3c72cfc4627df71c66d0d", size = 18729960 }, + { url = "https://files.pythonhosted.org/packages/05/19/94d6c66184c7d0f9374330c714f62c147dbb53eda9efdcc8fc6e2ac454c5/rasterio-1.4.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9bab1a0bb22b8bed1db34b5258db93d790ed4e61ef21ac055a7c6933c8d5e84", size = 22237518 }, + { url = 
"https://files.pythonhosted.org/packages/df/88/9db5f49ebfdd9c12365e4cac76c34ccb1a642b1c8cbab4124b3c681495de/rasterio-1.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:1839960e2f3057a6daa323ccf67b330f8f2f0dbd4a50cc7031e88e649301c5c0", size = 25424949 }, +] + [[package]] name = "rchitect" version = "0.4.7"