Source code for xhydro.indicators.signatures

# Created on Mon Sep 22 2025
# @author: Ève Larose (user e-larose)
"""
Aggregated hydrological signature package for xhydro, for model evaluation.

This signature package is useful for watershed comparisons.
For temporal analysis see xclim.indices._hydrology library
"""
# Import packages

import numpy as np
import xarray as xr
from scipy import signal, stats
from xclim.core.units import amount2rate, convert_units_to, units2pint
from xscen.utils import standardize_periods

from xhydro.indicators import generic
from xhydro.utils import health_checks


__all__ = ["elasticity_index", "flow_duration_curve_slope", "hurst_exp", "total_runoff_ratio"]



[docs]
def elasticity_index(
    q: xr.DataArray,
    pr: xr.DataArray,
    periods: list[str] | list[list[str]] | None = None,
    missing: str = "skip",
    missing_options: dict | None = None,
) -> xr.DataArray:
    """
    Elasticity index.

    Compute the median of yearly streamflow elasticity index for given catchments,
    where elasticity (εₚ) is defined as the relative change in streamflow (ΔQ/Q)
    divided by the relative change in precipitation (ΔP/P).

    Parameters
    ----------
    q : xr.DataArray
        Daily discharge data.
    pr : xr.DataArray
        Daily precipitation data.
    periods : list of str or list of list of str, optional
        Either [start, end] or list of [start, end] of periods to be considered.
        If multiple periods are given, the output will have a `horizon` dimension.
        If None, all data is used.
    missing : str
        How to handle missing values. One of "skip", "any", "at_least_n", "pct", "wmo".
        See :py:func:`xclim.core.missing` for more information.
    missing_options : dict, optional
        Dictionary of options for the missing values' method. See :py:func:`xclim.core.missing` for more information.

    Returns
    -------
    xr.DataArray
        Nonparametric estimator for streamflow elasticity index (dimensionless).

    Notes
    -----
    A value of εp greater than 1 indicates that streamflow is highly sensitive to precipitation changes,
    meaning a 1% change in precipitation will lead to a greater than 1% change in streamflow.
    A value less than 1 suggests a less sensitive relationship.
    It is recommended to use yearly frequency in order to have more robust elasticity_index

    References
    ----------
    Sankarasubramanian, A., Vogel, R. M., & Limbrunner, J. F. (2001). Climate elasticity of streamflow
    in the United States. Water Resources Research, 37(6), 1771–1781. https://doi.org/10.1029/2000WR900330
    """
    ds = xr.Dataset()  # get_yearly_op needs a Dataset input
    ds["q"] = q
    ds["q"].attrs["units"] = ds["q"].attrs.get("units", "undefined")
    ds["pr"] = pr
    ds["pr"].attrs["units"] = ds["pr"].attrs.get("units", "undefined")

    periods = (
        standardize_periods(periods, multiple=True) if periods is not None else [[str(int(ds.time.dt.year.min())), str(int(ds.time.dt.year.max()))]]
    )
    out = []
    for period in periods:
        ds_subset = ds.sel(time=slice(period[0], period[1]))

        # Annual mean from raw data
        q_annual = generic.get_yearly_op(
            ds_subset, input_var="q", op="mean", timeargs={"annual": {}}, missing=missing, missing_options=missing_options
        )
        p_annual = generic.get_yearly_op(
            ds_subset, input_var="pr", op="mean", timeargs={"annual": {}}, missing=missing, missing_options=missing_options
        )

        # Year-to-year changes
        delta_q = q_annual.diff(dim="time")
        delta_p = p_annual.diff(dim="time")

        # Avoid division by zero
        # TODO: Maybe eventually replace with xsdba.jitter_under_thresh, but this would add a dependency
        epsilon = 1e-10
        with xr.set_options(keep_attrs=True):
            # Relative changes
            rel_delta_p = delta_p / xr.where(p_annual == 0, epsilon, p_annual)
            rel_delta_q = delta_q / xr.where(q_annual == 0, epsilon, q_annual)

            # Compute yearly streamflow elasticity (not mathematically robust values)
            yearly_elasticity = rel_delta_q["q_mean_annual"] / xr.where(rel_delta_p["pr_mean_annual"] == 0, epsilon, rel_delta_p["pr_mean_annual"])

        yearly_elasticity = yearly_elasticity.expand_dims({"horizon": ["-".join(period)]})
        out.append(yearly_elasticity)

    out_combined = xr.concat(out, dim="horizon")

    # Median over selected periods
    ei = out_combined.median(dim="time")

    return xr.DataArray(ei, attrs={"units": "", "long_name": "Elasticity index"})




[docs]
def flow_duration_curve_slope(
    q: xr.DataArray,
    periods: list[str] | list[list[str]] | None = None,
    missing: str = "skip",
    missing_options: dict | None = None,
) -> xr.DataArray:
    """
    Calculate the slope of the flow duration curve mid-section between the 33% and 66% exceedance probabilities.

    Aggregated analysis : Single value as a long-term benchmark.

    Parameters
    ----------
    q : xarray.DataArray
        Daily streamflow data, expected to have a discharge unit.
    periods : list of str or list of list of str, optional
        Either [start, end] or list of [start, end] of periods to be considered.
        If multiple periods are given, the output will have a `horizon` dimension.
        If None, all data is used.
    missing : str
        How to handle missing values. One of "skip", "any", "at_least_n", "pct", "wmo".
        See :py:func:`xclim.core.missing` for more information.
    missing_options : dict, optional
        Dictionary of options for the missing values' method. See :py:func:`xclim.core.missing` for more information.

    Returns
    -------
    xarray.DataArray
        Slope of the FDC between the 33% and 66% exceedance probabilities, unitless.

    Notes
    -----
    Single Returned numeric values are positive.

    High slope value :
    Steep slopes of the midsegment FDC are typically a characteristic behavior for watersheds having ‘flashy’ responses
    (for example due to small soil storage capacity and hence larger percentage of overland flow).

    Lower slope value :
    Flatter slopes of the midsegment FDC are associated with watersheds having slower and
    more sustained groundwater low response.

    References
    ----------
    Vogel, R. M., & Fennessey, N. M. (1994). Flow-duration curves. I: New interpretation and confidence intervals.
    Journal of Water Resources Planning and Management, 120(4), 485-504.
    DOI:10.1061/(ASCE)0733-9496(1994)120:4(485)

    Yilmaz, K. K., Gupta, H. V., & Wagener, T. (2008). A process‐based diagnostic approach to model evaluation:
    Application to the NWS distributed hydrologic model. Water resources research, 44(9).
    DOI:10.1029/2007WR006716
    """
    if missing != "skip":
        # Compatibility with health_checks
        str_to_hc = {
            "any": "missing_any",
            "at_least_n": "at_least_n_valid",
            "pct": "missing_pct",
            "wmo": "missing_wmo",
        }
        missing_options = missing_options or {}
        health_checks(q, missing={str_to_hc[missing]: missing_options}, raise_on="all")

    periods = (
        standardize_periods(periods, multiple=True) if periods is not None else [[str(int(q.time.dt.year.min())), str(int(q.time.dt.year.max()))]]
    )
    out = []
    for period in periods:
        subset_q = q.sel(time=slice(period[0], period[1]))
        # Calculate the natural logarithm of the 33rd and 66th percentiles directly across the 'time' dimension
        q33 = np.log(subset_q.quantile(0.33, dim="time", skipna=True))
        q66 = np.log(subset_q.quantile(0.66, dim="time", skipna=True))

        # Calculate the slope (unitless)
        slope = (q33 - q66) / (0.33 - 0.66)

        slope = slope.expand_dims({"horizon": ["-".join(period)]})
        out.append(slope)

    out_combined = xr.concat(out, dim="horizon")
    out_combined.attrs = {
        "units": "",
        "long_name": "Slope of FDC between 33% and 66% exceedance probabilities",
        "description": "Slope of the flow duration curve mid-section between the 33% and 66% exceedance probabilities.",
    }
    return out_combined




[docs]
def total_runoff_ratio(
    q: xr.DataArray,
    drainage_area: xr.DataArray,
    pr: xr.DataArray,
    periods: list[str] | list[list[str]] | None = None,
    missing: str = "skip",
    missing_options: dict | None = None,
) -> xr.DataArray:
    """
    Total runoff ratio.

    Compute the ratio of streamflow measured at a stream gauge station to the total precipitation over the watershed.
    Also known as the runoff coefficient, it is higher in watersheds with steep slopes,
    impervious surfaces or high evapotranspiration.

    Parameters
    ----------
    q : xarray.DataArray
        Streamflow.
    drainage_area : xarray.DataArray
        Watershed.
    pr : xarray.DataArray
        Mean daily Precipitation.
    periods : list of str or list of list of str, optional
        Either [start, end] or list of [start, end] of periods to be considered.
        If multiple periods are given, the output will have a `horizon` dimension.
        If None, all data is used.
    missing : str
        How to handle missing values. One of "skip", "any", "at_least_n", "pct", "wmo".
        See :py:func:`xclim.core.missing` for more information.
    missing_options : dict, optional
        Dictionary of options for the missing values' method. See :py:func:`xclim.core.missing` for more information.

    Returns
    -------
    xarray.DataArray
        Single value rainfall-runoff ratio (RRR) as long-term benchmark.

    Notes
    -----
    - Total Runoff ratio values are comparable to Runoff coefficients,
    - Values near 0 mean most precipitation infiltrates watershed soil or is lost to evapotranspiration.
    - Values near 1 mean most precipitation leaves the watershed as runoff;
      possible causes are impervious surfaces from urban sprawl, thin soils, steep slopes, etc.
    - Long-term averages are typically ≤ 1.

    References
    ----------
    HydroBM https://hydrobm.readthedocs.io/en/latest/usage.html#benchmarks
    """
    if missing != "skip":
        # Compatibility with health_checks
        str_to_hc = {
            "any": "missing_any",
            "at_least_n": "at_least_n_valid",
            "pct": "missing_pct",
            "wmo": "missing_wmo",
        }
        missing_options = missing_options or {}
        health_checks(q, missing={str_to_hc[missing]: missing_options}, raise_on="all")

    # Convert units
    q = convert_units_to(q, "mm3/hr")
    drainage_area = convert_units_to(drainage_area, "mm2")
    runoff = q / drainage_area
    runoff.attrs["units"] = "mm h-1"

    if units2pint(pr).dimensionality.get("[time]") == 0:
        pr = amount2rate(pr)
    pr = convert_units_to(pr, "mm h-1", context="hydro")

    periods = (
        standardize_periods(periods, multiple=True) if periods is not None else [[str(int(q.time.dt.year.min())), str(int(q.time.dt.year.max()))]]
    )
    out = []
    for period in periods:
        subset_runoff = runoff.sel(time=slice(period[0], period[1]))
        subset_pr = pr.sel(time=slice(period[0], period[1]))

        total_runoff = subset_runoff.sum(dim="time", skipna=True)
        total_pr = subset_pr.sum(dim="time", skipna=True)

        runoff_ratio = total_runoff / total_pr
        runoff_ratio = runoff_ratio.expand_dims({"horizon": ["-".join(period)]})
        out.append(runoff_ratio)

    total_rr = xr.concat(out, dim="horizon")
    total_rr.attrs = {
        "units": "",
        "long_name": "Total Rainfall-Runoff Ratio",
        "description": "Ratio of total streamflow to total precipitation over the watershed.",
    }
    return total_rr




[docs]
def hurst_exp(
    q: xr.DataArray,
    selected_low_frequecy: int = 0.01,
    periods: list[str] | list[list[str]] | None = None,
    missing: str = "skip",
    missing_options: dict | None = None,
) -> xr.DataArray:
    """
    Hurst Exponent.

    Compute the Hurst Exponent (H) of time-series obtained from the slope of
    the power spectral density (estimated by periodogram) of the streamflow time series at near-zero frequency.

    Parameters
    ----------
    q : xarray.DataArray
        Streamflow in [discharge] units.
    selected_low_frequecy : int
        Near-zero frequency, can be adjusted to best represent the persistency of observed streamflow. Default is 0.01.
    periods : list of str or list of list of str, optional
        Either [start, end] or list of [start, end] of periods to be considered.
        If multiple periods are given, the output will have a `horizon` dimension.
        If None, all data is used.
    missing : str
        How to handle missing values. One of "skip", "any", "at_least_n", "pct", "wmo".
        See :py:func:`xclim.core.missing` for more information.
    missing_options : dict, optional
        Dictionary of options for the missing values' method. See :py:func:`xclim.core.missing` for more information.

    Returns
    -------
    xarray.DataArray
        Single value Hurst Exponent (H).

    Notes
    -----
    In order to compute Hurst Exponent, missing values (if any) will be linearly interpolated along the time dimension.
    Leading and trailing NaNs will be filled using nearest valid neighbor method.
    Remaining single NaNs (if any) will be filled using linear interpolation.

    - Hurst Exponent serves as a statistical health check for observed and simulated streamflows;
    - H>0.5: persistence (long-term memory, common in hydrology) acceptable range: 0.5 to 1.
    - H<0.5: noise and anti-persistence.

    References
    ----------
    Gupta, A., Hantush, M. M., Govindaraju, R. S., & Beven, K. (2024). Evaluation of hydrological models
    at gauged and ungauged basins using machine learning-based limits-of-acceptability and hydrological signatures.
    Journal of Hydrology, 641, 131774. https://doi.org/10.1016/j.jhydrol.2024.131774.

    Koutsoyiannis, D., and A. Montanari (2007), Statistical analysis of hydroclimatic time series:
    Uncertainty and insights, Water Resour. Res., 43, W05429, doi:10.1029/2006WR005592.
    """
    if missing != "skip":
        # Compatibility with health_checks
        str_to_hc = {
            "any": "missing_any",
            "at_least_n": "at_least_n_valid",
            "pct": "missing_pct",
            "wmo": "missing_wmo",
        }
        missing_options = missing_options or {}
        health_checks(q, missing={str_to_hc[missing]: missing_options}, raise_on="all")

    q_interp = q.interpolate_na("time")

    # Fill leading/trailing NaNs using nearest valid neighbor
    q_filled = q_interp.ffill("time").bfill("time")
    q_filled = q_filled.where(np.isfinite(q_filled), np.nan)

    # Final fill for remaining single points (if any)
    q_filled = q_filled.interpolate_na("time")
    q_filled = q_filled.astype(float)

    out = []
    periods = (
        standardize_periods(periods, multiple=True)
        if periods is not None
        else [[str(int(q_filled.time.dt.year.min())), str(int(q_filled.time.dt.year.max()))]]
    )
    for period in periods:
        subset_q = q_filled.sel(time=slice(period[0], period[1]))

        # Detrend to remove strong low-freq bias
        q_detrended = xr.DataArray(
            signal.detrend(subset_q),
            coords=subset_q.coords,
            dims=subset_q.dims,
        )

        # TODO: Wrap this in a apply_ufunc for performance

        # Compute periodogram using correct fs
        f, pxx = signal.periodogram(q_detrended, fs=1, scaling="density")  # freq default fs = 1day

        mask = (f > 0) & (f < f.max() * selected_low_frequecy)  # select near zero freq

        if mask.sum() < 20:
            mask = (f > 0) & (f < f.max() * 0.02)  # check for at least 20 frequency bins

        freqs_low = f[mask]
        pxx_low = pxx[mask]

        slope, intercept, r_value, p_value, std_err = stats.linregress(np.log10(freqs_low), np.log10(pxx_low))
        beta = -slope  # slope is negative, so Beta = -slope
        h = (beta + 1) / 2

        h = xr.DataArray(h, coords=subset_q.drop_vars("time").coords, dims=tuple(set(subset_q.dims) - {"time"}))
        h = h.expand_dims({"horizon": ["-".join(period)]})
        out.append(h)
    h_out = xr.concat(out, dim="horizon")
    h_out.attrs = {
        "units": "",
        "long_name": "Hurst Exponent",
    }

    return h_out