diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e0f7656 --- /dev/null +++ b/.gitignore @@ -0,0 +1,42 @@ +# Python +__pycache__/ +*.py[cod] +*.pyo +*.pyd +.Python +*.egg-info/ +*.egg +dist/ +build/ + +# Virtual environment +.venv/ +venv/ +env/ + +# Pytest / coverage +.pytest_cache/ +.coverage +htmlcov/ + +# Data files (large, transferred separately) +data/*.parquet +data/*.xlsx +data/*.csv +data/*.tif +data/*.geojson + +# Pipeline cache and intermediate outputs +cache/ + +# Jupyter +.ipynb_checkpoints/ +*.ipynb + +# OS +.DS_Store +Thumbs.db + +# IDE +.vscode/settings.json +.idea/ diff --git a/docs/study_boundary/study_area.gpkg b/docs/study_boundary/study_area.gpkg new file mode 100644 index 0000000..b7b266a Binary files /dev/null and b/docs/study_boundary/study_area.gpkg differ diff --git a/scripts/open_parquet.py b/scripts/open_parquet.py new file mode 100644 index 0000000..a58bc10 --- /dev/null +++ b/scripts/open_parquet.py @@ -0,0 +1,4 @@ +# impport parquet +import pyarrow.parquet as pq +table = pq.read_table(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\usgs_rating_curves.parquet") +print(table) \ No newline at end of file diff --git a/scripts/plot_calibration_analysis.py b/scripts/plot_calibration_analysis.py new file mode 100644 index 0000000..09c64f2 --- /dev/null +++ b/scripts/plot_calibration_analysis.py @@ -0,0 +1,1229 @@ +""" +Calibration analysis and visualization — full study area. + +Individual plots (E:/SI/out/HUC{huc8}/src_plots/): + rc_{huc8}_{bid}.png Rating curve comparison per calibrated branch + +Study-area aggregate (E:/SI/out/calibration_analysis/): + 01_n_profiles.png Manning's n vs HAND stage — all gauges overlaid + 02_q_scatter.png Q_orig vs Q_USGS / Q_calib vs Q_USGS (log scale) + 03_adj_ratio.png Q_calib / Q_orig by zone — violin plots + 04_n_by_zone.png Calibrated n distribution by zone — box plots + 05_coverage.png Calibrated vs total branches per HUC + 06_metrics.png NSE / KGE / PBIAS / RMSE before-vs-after scatter + 07_n_vs_slope.png Calibrated n vs channel slope by zone + 08_stage_shift.png HAND stage shift (Δh) at each recurrence flow + 09_n_direction.png Count/% of gauges where n increased vs decreased (stacked bar + table) + 10_src_ensemble_original.png All original SRCs overlaid with median + IQR band + 11_src_ensemble_calibrated.png All calibrated SRCs overlaid with median + IQR band + metrics_summary.csv Per-gauge tabular metrics + +Run: + .venv\\Scripts\\python.exe scripts/plot_calibration_analysis.py +""" +from __future__ import annotations + +import logging +from pathlib import Path + +import matplotlib as mpl +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt +import matplotlib.ticker as mticker +import numpy as np +import pandas as pd +from matplotlib.lines import Line2D + +# ── Style ───────────────────────────────────────────────────────── +mpl.rcParams.update({ + "font.family": "DejaVu Sans", + "font.size": 10.5, + "axes.labelsize": 12, + "axes.titlesize": 11.5, + "axes.titleweight": "bold", + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.22, + "grid.linestyle": ":", + "legend.fontsize": 9.5, + "legend.framealpha": 0.92, + "figure.dpi": 150, +}) + +# ── Palette ─────────────────────────────────────────────────────── +C_USGS = "#111111" +C_ORIG = "#2471a3" +C_CALIB = "#c0392b" +C_NAVY = "#1b4f72" +C_GRAY = "#7f8c8d" +ZONE_CLR = ["#aed6f1", "#a9dfbf", "#fad7a0", "#f9e79f", "#f1948a", "#d2b4de"] +GAUGE_PALETTE = [ + "#e41a1c", "#377eb8", "#4daf4a", "#984ea3", + "#ff7f00", "#a65628", "#f781bf", "#999999", + "#1b9e77", "#d95f02", "#7570b3", "#e7298a", + "#66a61e", "#e6ab02", "#a6761d", "#666666", + "#8dd3c7", "#fb8072", "#80b1d3", "#fdb462", +] + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") +DATA_DIR = Path("data") +SUMMARY_DIR = Path("E:/SI/out/calibration_analysis") +RECURRENCE_YEARS = [2, 5, 10, 25, 50] +RECUR_COLS = {yr: f"{yr}_0_year_recurrence_flow_17C" for yr in RECURRENCE_YEARS} +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 1 — Metrics +# ══════════════════════════════════════════════════════════════════ + +def _nse(obs, sim): + d = np.sum((obs - obs.mean()) ** 2) + return float(1 - np.sum((sim - obs) ** 2) / d) if d > 0 else np.nan + +def _kge(obs, sim): + if obs.std() == 0 or sim.std() == 0: + return np.nan + r = np.corrcoef(obs, sim)[0, 1] + alpha = sim.std() / obs.std() + beta = sim.mean() / obs.mean() if obs.mean() != 0 else np.nan + return float(1 - np.sqrt((r - 1)**2 + (alpha - 1)**2 + (beta - 1)**2)) + +def _pbias(obs, sim): + return float(100 * (sim - obs).sum() / obs.sum()) if obs.sum() != 0 else np.nan + +def _rmse(obs, sim): + return float(np.sqrt(((sim - obs) ** 2).mean())) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 2 — Data helpers +# ══════════════════════════════════════════════════════════════════ + +def _load_shared_data(): + rc = pd.read_parquet(DATA_DIR / "usgs_rating_curves.parquet") + rc["location_id"] = rc["location_id"].astype(str) + rc["flow_cms"] = rc["flow"].astype(float) / 35.3147 + rc["elev_m"] = rc["elevation_navd88"].astype(float) / 3.28084 + + recur = pd.read_parquet(DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet") + for yr, col in RECUR_COLS.items(): + recur[f"Q_{yr}yr_cms"] = recur[col].astype(float) * 0.028317 + recur["feature_id"] = recur["feature_id"].astype("Int64") + return rc, recur + + +def _load_src(branch_dir: Path, bid: str, original: bool) -> pd.DataFrame | None: + suffix = ".pre_n_calib.csv" if original else ".csv" + p = branch_dir / f"src_full_crosswalked_{bid}{suffix}" + if not p.exists(): + if original: + p = branch_dir / f"src_full_crosswalked_{bid}.csv" + if not p.exists(): + return None + df = pd.read_csv(p, low_memory=False) + df["HydroID"] = df["HydroID"].astype(int) + return df + + +def _zone_pts(loc_id, feat_id, dem_adj_m, rc, recur): + loc_pad = str(loc_id).zfill(8) + grc = rc[rc["location_id"] == loc_pad].sort_values("flow_cms").drop_duplicates("flow_cms") + if grc.empty: + return [] + feat = recur[recur["feature_id"] == feat_id] + if feat.empty: + return [] + pts = [] + for yr in RECURRENCE_YEARS: + Q = float(feat[f"Q_{yr}yr_cms"].iloc[0]) + if Q < grc["flow_cms"].min() or Q > grc["flow_cms"].max(): + continue + elev = float(np.interp(Q, grc["flow_cms"].values, grc["elev_m"].values)) + h = elev - dem_adj_m + if h > 0: + pts.append((yr, h, Q)) + return sorted(pts, key=lambda x: x[0]) + + +def _n_step(zp, src_calib, hydroid, y_max): + """Return (n_vals, h_vals) step arrays for the Manning's n twin axis.""" + hdf = src_calib[src_calib["HydroID"] == hydroid].sort_values("Stage") + if hdf.empty or "zonal_n_applied" not in hdf.columns: + return [], [] + h_brk = [0.0] + [h for _, h, _ in zp] + if h_brk[-1] < y_max: + h_brk.append(y_max) + nx, hy = [], [] + for i, (h_lo, h_hi) in enumerate(zip(h_brk[:-1], h_brk[1:])): + idx = (hdf["Stage"] - (h_lo + h_hi) / 2).abs().idxmin() + n_v = float(hdf.loc[idx, "zonal_n_applied"]) + nx.extend([n_v, n_v]) + hy.extend([h_lo, h_hi]) + if i < len(h_brk) - 2: + nx.append(float(hdf.loc[ + (hdf["Stage"] - h_brk[i+1]).abs().idxmin(), "zonal_n_applied"])) + hy.append(h_hi) + return nx, hy + + +def _discover_calibrated(huc8: str, rc: pd.DataFrame, elev: pd.DataFrame): + """ + Returns list of dicts with gauge info for branches that were actually + calibrated (have a .pre_n_calib.csv backup AND have USGS RC data). + """ + branches_dir = OUT_DIR / f"HUC{huc8}" / "watershed-data" / "branches" + if not branches_dir.exists(): + return [] + rc_ids = set(rc["location_id"].astype(str)) + results = [] + for branch_dir in sorted(branches_dir.iterdir()): + if not branch_dir.is_dir(): + continue + bid = branch_dir.name + if bid == "0": + continue + pre_exists = (branch_dir / f"src_full_crosswalked_{bid}.pre_n_calib.csv").exists() + if not pre_exists: + continue + branch_elev = elev[elev["levpa_id"].astype(str) == bid] + for _, gage in branch_elev.iterrows(): + loc_id = str(gage["location_id"]).zfill(8) + if loc_id not in rc_ids: + continue + if pd.isna(gage.get("feature_id")): + continue + results.append({ + "bid": bid, + "location_id": loc_id, + "hydroid": int(gage["HydroID"]), + "feature_id": int(gage["feature_id"]), + "dem_adj_m": float(gage["dem_adj_elevation"]), + "branch_dir": branch_dir, + }) + return results + + +def _total_branches(huc8: str) -> int: + lst = OUT_DIR / f"HUC{huc8}" / "watershed-data" / "branch_ids.lst" + if not lst.exists(): + return 0 + lines = [l.strip() for l in lst.read_text().splitlines() if l.strip()] + return len(lines) + 1 # +1 for Branch Zero + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 3 — Individual RC plots +# ══════════════════════════════════════════════════════════════════ + +def _draw_rc_panel(ax, bid, loc_id, hydroid, feat_id, dem_adj_m, + src_orig, src_calib, rc, recur): + zp = _zone_pts(loc_id, feat_id, dem_adj_m, rc, recur) + if not zp: + return + + loc_pad = str(loc_id).zfill(8) + grc = rc[rc["location_id"] == loc_pad].sort_values("flow_cms").copy() + grc["hand"] = grc["elev_m"] - dem_adj_m + grc = grc[grc["hand"] > 0] + + h_max_zone = max(h for _, h, _ in zp) + y_max = max(h_max_zone * 1.5, 3.0) + rc_q_max = float(grc[grc["hand"] <= y_max * 1.05]["flow_cms"].max()) if not grc.empty else 0.0 + x_max = max(rc_q_max, max(Q for _, _, Q in zp) * 1.4) * 1.08 + + # Zone shading + h_edges = [0.0] + [h for _, h, _ in zp] + if h_edges[-1] < y_max: + h_edges.append(y_max) + for i, (hlo, hhi) in enumerate(zip(h_edges[:-1], h_edges[1:])): + ax.axhspan(hlo, min(hhi, y_max), color=ZONE_CLR[min(i, 5)], + alpha=0.28, zorder=0, lw=0) + + # Zone lines + labels + prev_lh = -1.0 + min_sep = y_max * 0.07 + for yr, h, _ in zp: + ax.axhline(h, color="#888", ls=":", lw=0.9, alpha=0.7, zorder=1) + lh = max(h, prev_lh + min_sep) + ax.text(x_max * 0.985, lh, f"{yr} yr", + ha="right", va="bottom", fontsize=8.5, + color="#555", fontweight="bold", zorder=10) + prev_lh = lh + + # USGS RC + if not grc.empty: + rcp = grc[grc["hand"] <= y_max * 1.02] + ax.plot(rcp["flow_cms"], rcp["hand"], + color=C_USGS, lw=2.5, zorder=5, label="USGS Rating Curve") + for _, h, Q in zp: + ax.scatter(Q, h, color=C_USGS, s=55, zorder=9, + edgecolors="white", linewidths=0.8) + + # SRC curves + s_o = src_orig[src_orig["HydroID"] == hydroid].sort_values("Stage") + s_c = src_calib[src_calib["HydroID"] == hydroid].sort_values("Stage") + if not s_o.empty: + ax.plot(s_o["Discharge (m3s-1)"], s_o["Stage"], + color=C_ORIG, lw=1.8, ls="-", zorder=4, label="Original SRC (n=0.06)") + if not s_c.empty: + ax.plot(s_c["Discharge (m3s-1)"], s_c["Stage"], + color=C_CALIB, lw=2.8, ls="--", zorder=6, label="Calibrated SRC") + + # Per-zone n value labels inside each zone band + if not s_c.empty and "zonal_n_applied" in s_c.columns and zp: + h_brk = [0.0] + [h for _, h, _ in zp] + [y_max] + for i, (hlo, hhi) in enumerate(zip(h_brk[:-1], h_brk[1:])): + hlo_c, hhi_c = min(hlo, y_max), min(hhi, y_max) + if hhi_c <= hlo_c: + continue + mid_h = (hlo_c + hhi_c) / 2 + idx = (s_c["Stage"] - mid_h).abs().idxmin() + n_val = float(s_c.loc[idx, "zonal_n_applied"]) + ax.text(x_max * 0.015, mid_h, f"n = {n_val:.3f}", + ha="left", va="center", fontsize=8.5, color="#222", + fontweight="bold", zorder=12, + bbox=dict(boxstyle="round,pad=0.22", facecolor="white", + alpha=0.80, edgecolor="none")) + # Footer note explaining kinks + ax.text(0.5, -0.10, + "Kinks in calibrated SRC at zone boundaries are expected — Manning's n changes\n" + "abruptly between zones; the curve is continuous within each zone.", + transform=ax.transAxes, ha="center", va="top", + fontsize=8, color="#666", style="italic") + + # Manning's n twin axis + ax_n = ax.twiny() + nx, hy = _n_step(zp, src_calib, hydroid, y_max) + if nx: + ax_n.plot(nx, hy, color="#444", alpha=0.4, lw=2.0, zorder=3) + ax_n.fill_betweenx(hy, 0, nx, color="#888", alpha=0.07, zorder=2) + n_vals = [v for v in nx if v > 0] if nx else [0.06] + ax_n.set_xlim(0, max(max(n_vals) * 1.5, 0.35)) + ax_n.set_xlabel("Manning's n →", fontsize=9, color="#666", labelpad=4) + ax_n.tick_params(axis="x", colors="#777", labelsize=8) + for sp in ["right", "left", "bottom"]: + ax_n.spines[sp].set_visible(False) + ax_n.spines["top"].set_color("#aaa") + + ax.set_xlim(0, x_max) + ax.set_ylim(0, y_max) + ax.set_xlabel("Discharge (m³/s)", labelpad=6) + ax.set_ylabel("HAND Stage (m above thalweg)", labelpad=6) + ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{x:,.0f}")) + ax.set_title(f"Gauge {loc_id} · HydroID {hydroid}", pad=26) + ax.legend(loc="upper left", fontsize=9) + + +def plot_individual_branches(hucs, rc, recur): + log.info("=== Individual RC plots ===") + for huc8 in hucs: + elev_path = OUT_DIR / f"HUC{huc8}" / "usgs_elev_table.csv" + if not elev_path.exists(): + continue + elev = pd.read_csv(elev_path, dtype={"location_id": str, "feature_id": "Int64"}) + gauges = _discover_calibrated(huc8, rc, elev) + if not gauges: + continue + + # Group by branch + by_branch: dict[str, list] = {} + for g in gauges: + by_branch.setdefault(g["bid"], []).append(g) + + save_dir = OUT_DIR / f"HUC{huc8}" / "src_plots" + save_dir.mkdir(parents=True, exist_ok=True) + + for bid, gage_list in by_branch.items(): + branch_dir = gage_list[0]["branch_dir"] + src_orig = _load_src(branch_dir, bid, original=True) + src_calib = _load_src(branch_dir, bid, original=False) + if src_orig is None or src_calib is None: + continue + + n_g = len(gage_list) + fig, axes = plt.subplots(1, n_g, figsize=(12 * n_g, 8.5), + squeeze=False, constrained_layout=True) + for col, g in enumerate(gage_list): + _draw_rc_panel(axes[0, col], bid, + g["location_id"], g["hydroid"], g["feature_id"], + g["dem_adj_m"], + src_orig, src_calib, rc, recur) + + fig.suptitle( + f"Manning's n Recurrence Calibration · " + f"HUC8 {huc8} · Branch {bid}", + fontsize=13, fontweight="bold", y=1.01, + ) + out = save_dir / f"rc_{huc8}_{bid}.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 4 — Data aggregation loop +# ══════════════════════════════════════════════════════════════════ + +def collect_all_data(hucs, rc, recur): + """ + Returns (master_df, metrics_df, zone_df, coverage_df). + + master_df : one row per USGS RC data point (gauge × stage) + metrics_df : one row per gauge (NSE, KGE, PBIAS, RMSE before/after) + zone_df : one row per (gauge × zone) with calibrated n and stage shift + coverage_df : one row per HUC with total and calibrated branch counts + """ + master_rows = [] + metric_rows = [] + zone_rows = [] + coverage_rows = [] + + for huc8 in hucs: + elev_path = OUT_DIR / f"HUC{huc8}" / "usgs_elev_table.csv" + n_total = _total_branches(huc8) + if not elev_path.exists(): + coverage_rows.append({"huc8": huc8, "total": n_total, "calibrated": 0}) + continue + elev = pd.read_csv(elev_path, dtype={"location_id": str, "feature_id": "Int64"}) + gauges = _discover_calibrated(huc8, rc, elev) + calibrated_bids = {g["bid"] for g in gauges} + coverage_rows.append({ + "huc8": huc8, + "total": n_total, + "calibrated": len(calibrated_bids), + }) + if not gauges: + continue + + by_branch: dict[str, list] = {} + for g in gauges: + by_branch.setdefault(g["bid"], []).append(g) + + for bid, gage_list in by_branch.items(): + branch_dir = gage_list[0]["branch_dir"] + src_orig = _load_src(branch_dir, bid, original=True) + src_calib = _load_src(branch_dir, bid, original=False) + if src_orig is None or src_calib is None: + continue + + for g in gage_list: + loc_id = g["location_id"] + hydroid = g["hydroid"] + feat_id = g["feature_id"] + dem_adj = g["dem_adj_m"] + + zp = _zone_pts(loc_id, feat_id, dem_adj, rc, recur) + if not zp: + continue + + # USGS RC in HAND space + loc_pad = str(loc_id).zfill(8) + grc = rc[rc["location_id"] == loc_pad].copy() + grc["hand"] = grc["elev_m"] - dem_adj + grc = grc[(grc["hand"] > 0)].sort_values("hand") + if grc.empty: + continue + + s_o = src_orig[src_orig["HydroID"] == hydroid].sort_values("Stage") + s_c = src_calib[src_calib["HydroID"] == hydroid].sort_values("Stage") + if s_o.empty or s_c.empty: + continue + + slope = float(s_o["SLOPE"].iloc[0]) + src_max = float(s_o["Stage"].max()) + valid = grc[grc["hand"] <= src_max * 1.05] + + q_usgs = valid["flow_cms"].values + q_orig = np.interp(valid["hand"].values, + s_o["Stage"].values, + s_o["Discharge (m3s-1)"].values) + q_calib = np.interp(valid["hand"].values, + s_c["Stage"].values, + s_c["Discharge (m3s-1)"].values) + + # Zone assignment + h_bounds = [0.0] + [h for _, h, _ in zp] + zones = np.searchsorted(h_bounds, valid["hand"].values, side="right") + + # n_applied (from calibrated SRC if column present, else NaN) + if "zonal_n_applied" in s_c.columns: + n_app = np.interp(valid["hand"].values, + s_c["Stage"].values, + s_c["zonal_n_applied"].values) + else: + n_app = np.full(len(valid), np.nan) + + # Master records + for i in range(len(valid)): + master_rows.append({ + "huc8": huc8, "bid": bid, + "location_id": loc_id, "hydroid": hydroid, + "hand_m": float(valid["hand"].values[i]), + "q_usgs": float(q_usgs[i]), + "q_orig": float(q_orig[i]), + "q_calib": float(q_calib[i]), + "zone": int(zones[i]), + "n_applied": float(n_app[i]), + "slope": slope, + }) + + # Metrics + mask = (q_usgs > 0) & (q_orig > 0) & (q_calib > 0) + if mask.sum() >= 5: + m = { + "huc8": huc8, "bid": bid, "location_id": loc_id, + "hydroid": hydroid, "n_pts": int(mask.sum()), + "nse_orig": _nse(q_usgs[mask], q_orig[mask]), + "nse_calib": _nse(q_usgs[mask], q_calib[mask]), + "kge_orig": _kge(q_usgs[mask], q_orig[mask]), + "kge_calib": _kge(q_usgs[mask], q_calib[mask]), + "pbias_orig": _pbias(q_usgs[mask], q_orig[mask]), + "pbias_calib": _pbias(q_usgs[mask], q_calib[mask]), + "rmse_orig": _rmse(q_usgs[mask], q_orig[mask]), + "rmse_calib": _rmse(q_usgs[mask], q_calib[mask]), + } + metric_rows.append(m) + + # Zone-level stats + h_sorted = sorted(zp, key=lambda x: x[0]) + for z_idx, (yr, h_zone, Q_r) in enumerate(h_sorted, 1): + idx = (s_o["Stage"] - h_zone).abs().idxmin() + row_o = s_o.loc[idx] + idx_c = (s_c["Stage"] - h_zone).abs().idxmin() + # Use the actually-applied n (already clipped to N_MIN/N_MAX by s05) + if "zonal_n_applied" in s_c.columns: + n_zone = float(s_c.loc[idx_c, "zonal_n_applied"]) + else: + A = float(row_o.get("WetArea (m2)", 0)) + R = float(row_o.get("HydraulicRadius (m)", 0)) + n_zone = float(A * (R ** (2/3)) * slope**0.5 / Q_r) if Q_r > 0 and A > 0 and R > 0 else np.nan + + # Stage shift at Q_r (invert SRC) + qo = s_o["Discharge (m3s-1)"].values + ho = s_o["Stage"].values + qc = s_c["Discharge (m3s-1)"].values + hc = s_c["Stage"].values + # Only invert where Q is monotonically increasing + h_at_Qr_orig = float(np.interp(Q_r, qo, ho)) if Q_r <= qo.max() else np.nan + h_at_Qr_calib = float(np.interp(Q_r, qc, hc)) if Q_r <= qc.max() else np.nan + delta_h = h_at_Qr_calib - h_at_Qr_orig if not (np.isnan(h_at_Qr_orig) or np.isnan(h_at_Qr_calib)) else np.nan + + zone_rows.append({ + "huc8": huc8, "bid": bid, "location_id": loc_id, + "hydroid": hydroid, "recurrence_yr": yr, + "zone": z_idx, "h_zone": h_zone, "Q_r": Q_r, + "n_calibrated": n_zone, "slope": slope, + "h_at_Qr_orig": h_at_Qr_orig, + "h_at_Qr_calib": h_at_Qr_calib, + "delta_h": delta_h, + }) + + master_df = pd.DataFrame(master_rows) + metrics_df = pd.DataFrame(metric_rows) + zone_df = pd.DataFrame(zone_rows) + coverage_df = pd.DataFrame(coverage_rows) + return master_df, metrics_df, zone_df, coverage_df + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 5 — Aggregate plots +# ══════════════════════════════════════════════════════════════════ + +def _gauge_label(loc_id, huc8): + return f"{loc_id}\n({huc8})" + + +def plot_01_n_profiles(zone_df: pd.DataFrame, out_dir: Path): + """Manning's n vs HAND stage — step-function for every gauge.""" + gauges = zone_df[["location_id", "huc8"]].drop_duplicates() + if gauges.empty: + return + + fig, ax = plt.subplots(figsize=(12, 7), constrained_layout=True) + legend_handles = [] + + for ci, (_, row) in enumerate(gauges.iterrows()): + loc_id = row["location_id"] + huc8 = row["huc8"] + gdf = zone_df[(zone_df["location_id"] == loc_id) & + (zone_df["huc8"] == huc8)].sort_values("zone") + if gdf.empty: + continue + col = GAUGE_PALETTE[ci % len(GAUGE_PALETTE)] + + # Build step function: [(h_lo, h_hi, n)] + h_vals = [0.0] + gdf["h_zone"].tolist() + n_vals = gdf["n_calibrated"].tolist() + for i, (hlo, hhi) in enumerate(zip(h_vals[:-1], h_vals[1:])): + ax.plot([n_vals[i], n_vals[i]], [hlo, hhi], + color=col, lw=1.8, alpha=0.85, zorder=4) + if i < len(n_vals) - 1: + ax.plot([n_vals[i], n_vals[i + 1]], [hhi, hhi], + color=col, lw=1.8, alpha=0.85, zorder=4) + # Extend to 0 at bottom + ax.plot([n_vals[0], n_vals[0]], [0, h_vals[1]], + color=col, lw=1.8, alpha=0.85, zorder=4) + legend_handles.append(Line2D([0], [0], color=col, lw=2, + label=_gauge_label(loc_id, huc8))) + + ax.axvline(0.06, color="#333", ls="--", lw=1.2, alpha=0.6, label="Default n = 0.06") + ax.axvline(0.12, color="#777", ls=":", lw=1.2, alpha=0.6, label="Overbank n = 0.12") + ax.set_xlabel("Manning's n", labelpad=8) + ax.set_ylabel("HAND Stage (m above thalweg)", labelpad=8) + ax.set_title("Calibrated Manning's n Profiles — All Gauges", pad=12) + ax.legend(handles=legend_handles + [ + Line2D([0], [0], color="#333", ls="--", lw=1.2, label="Default n = 0.06"), + Line2D([0], [0], color="#777", ls=":", lw=1.2, label="Overbank n = 0.12"), + ], loc="upper right", fontsize=8.5, ncol=2) + # Clip x-axis to 95th percentile — suppresses outliers without losing the main distribution + all_n = zone_df["n_calibrated"].dropna().values + x_clip = float(np.percentile(all_n, 95)) * 1.2 if len(all_n) > 0 else 0.5 + x_clip = max(x_clip, 0.4) + n_excl = int((all_n > x_clip).sum()) if len(all_n) > 0 else 0 + ax.set_xlim(0, x_clip) + if n_excl > 0: + ax.text(0.98, 0.02, + f"Note: {n_excl} n value(s) > {x_clip:.2f} not shown (outliers)", + transform=ax.transAxes, ha="right", va="bottom", + fontsize=8.5, color="#777", style="italic") + out = out_dir / "01_n_profiles.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_02_q_scatter(master_df: pd.DataFrame, out_dir: Path): + """Q comparison scatter (log scale) — original vs calibrated vs USGS.""" + if master_df.empty: + return + df = master_df[(master_df["q_usgs"] > 0) & + (master_df["q_orig"] > 0) & + (master_df["q_calib"] > 0)].copy() + df["zone"] = df["zone"].clip(1, 6) + + zone_colors = {z: ZONE_CLR[z - 1] for z in range(1, 7)} + ec = "#333" + + fig, axes = plt.subplots(1, 3, figsize=(17, 6), constrained_layout=True) + pairs = [ + (axes[0], "q_orig", "q_usgs", "Original SRC vs USGS RC"), + (axes[1], "q_calib", "q_usgs", "Calibrated SRC vs USGS RC"), + (axes[2], "q_orig", "q_calib","Original SRC vs Calibrated SRC"), + ] + for ax, xcol, ycol, title in pairs: + x, y = df[xcol].values, df[ycol].values + colors = [zone_colors.get(z, "#ccc") for z in df["zone"]] + ax.scatter(x, y, c=colors, s=12, alpha=0.6, edgecolors="none", zorder=4) + lims = [min(x.min(), y.min()) * 0.9, max(x.max(), y.max()) * 1.1] + ax.plot(lims, lims, "k--", lw=1.0, alpha=0.5, zorder=3, label="1:1") + ax.set_xscale("log"); ax.set_yscale("log") + ax.set_xlim(lims); ax.set_ylim(lims) + ax.set_xlabel(xcol.replace("_", " ").replace("q ", "Q ") + " (m³/s)") + ax.set_ylabel(ycol.replace("_", " ").replace("q ", "Q ") + " (m³/s)") + ax.set_title(title) + ax.legend(loc="upper left", fontsize=9) + + # Zone legend + patches = [mpatches.Patch(color=ZONE_CLR[i], label=f"Zone {i+1}") + for i in range(6)] + fig.legend(handles=patches, loc="lower center", ncol=6, + bbox_to_anchor=(0.5, -0.04), fontsize=9, + title="Zone", title_fontsize=9) + fig.suptitle("Discharge Comparison — All USGS RC Stages (log scale)", fontsize=13) + out = out_dir / "02_q_scatter.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_03_adj_ratio(master_df: pd.DataFrame, out_dir: Path): + """Q_calib / Q_orig by zone — violin + box plots.""" + if master_df.empty: + return + df = master_df[(master_df["q_orig"] > 0) & (master_df["q_calib"] > 0)].copy() + df["ratio"] = df["q_calib"] / df["q_orig"] + df["zone"] = df["zone"].clip(1, 6) + + fig, ax = plt.subplots(figsize=(11, 6), constrained_layout=True) + zone_labels = [f"Zone {z}" for z in range(1, 7)] + data_by_zone = [df[df["zone"] == z]["ratio"].dropna().values for z in range(1, 7)] + + vp = ax.violinplot(data_by_zone, positions=range(1, 7), + showmedians=True, showextrema=False, widths=0.6) + for i, body in enumerate(vp["bodies"]): + body.set_facecolor(ZONE_CLR[i]) + body.set_alpha(0.65) + vp["cmedians"].set_color("#111") + vp["cmedians"].set_linewidth(2) + + # Overlay box + bp = ax.boxplot(data_by_zone, positions=range(1, 7), + widths=0.18, patch_artist=False, + medianprops=dict(color="#333", lw=0), + whiskerprops=dict(color="#555"), + capprops=dict(color="#555"), + flierprops=dict(marker=".", ms=3, color="#aaa"), + manage_ticks=False) + + ax.axhline(1.0, color="#c0392b", lw=1.5, ls="--", alpha=0.8, + label="No change (ratio = 1)") + ax.set_xticks(range(1, 7)) + ax.set_xticklabels([f"Zone {z}\n({len(d)} pts)" for z, d in + zip(range(1, 7), data_by_zone)]) + ax.set_ylabel("Q_calibrated / Q_original") + ax.set_title("Discharge Adjustment Ratio by Zone\n" + "(ratio > 1 = calibration increased discharge; < 1 = decreased)") + ax.legend() + out = out_dir / "03_adj_ratio.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_04_n_by_zone(zone_df: pd.DataFrame, out_dir: Path): + """Calibrated n distribution by zone — box + strip.""" + if zone_df.empty: + return + df = zone_df.dropna(subset=["n_calibrated"]) + + fig, ax = plt.subplots(figsize=(11, 6), constrained_layout=True) + data = [df[df["zone"] == z]["n_calibrated"].values for z in range(1, 6)] + labels = [f"Zone {z}\n({RECURRENCE_YEARS[z-1]} yr boundary)" for z in range(1, 6)] + n_pts = [len(d) for d in data] + + bp = ax.boxplot(data, patch_artist=True, + medianprops=dict(color="#111", lw=2), + whiskerprops=dict(color="#555"), + capprops=dict(color="#555"), + flierprops=dict(marker="o", ms=5, alpha=0.5)) + for patch, col in zip(bp["boxes"], ZONE_CLR[:5]): + patch.set_facecolor(col) + patch.set_alpha(0.7) + + # Strip plot + rng = np.random.default_rng(42) + for z_idx, d in enumerate(data, 1): + jitter = rng.uniform(-0.12, 0.12, len(d)) + ax.scatter(np.full(len(d), z_idx) + jitter, d, + color=ZONE_CLR[z_idx - 1], s=28, alpha=0.7, + edgecolors="#333", linewidths=0.5, zorder=5) + + ax.axhline(0.06, color=C_NAVY, ls="--", lw=1.5, alpha=0.8, label="Default n = 0.06") + ax.axhline(0.12, color=C_GRAY, ls=":", lw=1.5, alpha=0.8, label="Overbank n = 0.12") + # Clip y-axis to 97th percentile — outlier protection + n_all = df["n_calibrated"].values + y_clip = float(np.percentile(n_all, 97)) * 1.15 if len(n_all) > 0 else 0.5 + y_clip = max(y_clip, 0.4) + n_excl = int((n_all > y_clip).sum()) if len(n_all) > 0 else 0 + ax.set_ylim(0, y_clip) + if n_excl > 0: + ax.text(0.98, 0.98, + f"Note: {n_excl} outlier(s) > {y_clip:.2f} not shown", + transform=ax.transAxes, ha="right", va="top", + fontsize=9, color="#777", style="italic") + ax.set_xticks(range(1, 6)) + ax.set_xticklabels([f"{l}\n(n={n})" for l, n in zip(labels, n_pts)]) + ax.set_ylabel("Calibrated Manning's n") + ax.set_title("Distribution of Calibrated Manning's n by Zone\n" + "(each point = one gauge; zone number = depth interval)") + ax.legend() + out = out_dir / "04_n_by_zone.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_05_coverage(coverage_df: pd.DataFrame, out_dir: Path): + """Calibrated vs total branches per HUC — horizontal stacked bar.""" + if coverage_df.empty: + return + df = coverage_df.sort_values("huc8").reset_index(drop=True) + df["uncalibrated"] = df["total"] - df["calibrated"] + + fig, ax = plt.subplots(figsize=(11, max(5, len(df) * 0.45 + 1.5)), + constrained_layout=True) + y = np.arange(len(df)) + ax.barh(y, df["uncalibrated"], color="#d0d3d4", label="Uncalibrated") + ax.barh(y, df["calibrated"], color=C_CALIB, alpha=0.8, + left=df["uncalibrated"], label="Calibrated") + + for i, row in df.iterrows(): + if row["calibrated"] > 0: + ax.text(row["total"] + 0.15, i, + f"{row['calibrated']}/{row['total']}", + va="center", fontsize=9, color="#333") + + ax.set_yticks(y) + ax.set_yticklabels(df["huc8"]) + ax.set_xlabel("Number of Branches") + ax.set_ylabel("HUC8") + ax.set_title("Calibration Coverage per HUC\n" + f"(Total: {df['calibrated'].sum()} calibrated / " + f"{df['total'].sum()} branches across {len(df)} HUCs)") + ax.legend(loc="lower right") + out = out_dir / "05_coverage.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_06_metrics(metrics_df: pd.DataFrame, out_dir: Path): + """Before-vs-after metric scatter — one panel per metric.""" + if metrics_df.empty: + return + + fig, axes = plt.subplots(2, 2, figsize=(12, 10), constrained_layout=True) + pairs = [ + (axes[0, 0], "nse_orig", "nse_calib", "NSE", + "higher is better", True), + (axes[0, 1], "kge_orig", "kge_calib", "KGE", + "higher is better", True), + (axes[1, 0], "pbias_orig", "pbias_calib", "Percent Bias (%)", + "closer to 0 is better", False), + (axes[1, 1], "rmse_orig", "rmse_calib", "RMSE (m³/s)", + "lower is better", False), + ] + + for ax, xcol, ycol, label, note, higher_better in pairs: + x = metrics_df[xcol].values + y = metrics_df[ycol].values + valid = ~(np.isnan(x) | np.isnan(y)) + x, y = x[valid], y[valid] + if len(x) == 0: + continue + + lims = [min(x.min(), y.min()) - abs(min(x.min(), y.min())) * 0.05, + max(x.max(), y.max()) * 1.05] + ax.plot(lims, lims, "--", color="#aaa", lw=1.2, zorder=2, label="No change") + + colors = [] + for xi, yi in zip(x, y): + if higher_better: + colors.append("#27ae60" if yi > xi else "#c0392b") + else: + colors.append("#27ae60" if abs(yi) < abs(xi) else "#c0392b") + + for xi, yi, c in zip(x, y, colors): + ax.annotate("", xy=(yi, xi), xytext=(xi, xi), + arrowprops=dict(arrowstyle="->", color=c, lw=1.0, alpha=0.4)) + ax.scatter(x, x, marker="o", s=60, color="#555", zorder=5, label="Before") + ax.scatter(y, x, marker="D", s=60, color=colors, zorder=6, edgecolors="#333", + linewidths=0.6, label="After") + + ax.set_xlim(lims); ax.set_ylim(lims) + ax.set_xlabel(f"{label} (before)") + ax.set_ylabel(f"{label} (after)") + ax.set_title(f"{label} — {note}") + ax.legend(fontsize=8.5) + + n_imp = sum(1 for xi, yi in zip(x, y) + if (yi > xi if higher_better else abs(yi) < abs(xi))) + ax.text(0.03, 0.97, f"Improved: {n_imp}/{len(x)}", + transform=ax.transAxes, va="top", fontsize=9.5, + color="#27ae60", fontweight="bold") + + fig.suptitle("Calibration Performance Metrics — Before vs After\n" + "(green arrow = improvement, red = degradation)", fontsize=13) + out = out_dir / "06_metrics.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_07_n_vs_slope(zone_df: pd.DataFrame, out_dir: Path): + """Calibrated n vs channel slope by zone — scatter + per-zone trend.""" + df = zone_df.dropna(subset=["n_calibrated", "slope"]) + df = df[df["slope"] > 0] + if df.empty: + return + + fig, ax = plt.subplots(figsize=(11, 6.5), constrained_layout=True) + legend_handles = [] + for z in range(1, 6): + sub = df[df["zone"] == z] + if sub.empty: + continue + col = ZONE_CLR[z - 1] + ax.scatter(sub["slope"], sub["n_calibrated"], + color=col, s=65, alpha=0.8, + edgecolors="#333", linewidths=0.6, zorder=5) + # Log-linear trend line + log_s = np.log10(sub["slope"].values) + n_v = sub["n_calibrated"].values + if len(sub) >= 3: + p = np.polyfit(log_s, n_v, 1) + s_rng = np.logspace(np.log10(sub["slope"].min()), + np.log10(sub["slope"].max()), 50) + ax.plot(s_rng, np.polyval(p, np.log10(s_rng)), + color=col, lw=1.8, ls="-", alpha=0.6, zorder=4) + legend_handles.append( + mpatches.Patch(color=col, alpha=0.8, + label=f"Zone {z} ({RECURRENCE_YEARS[z-1]} yr)")) + + ax.set_xscale("log") + ax.set_xlabel("Channel Slope (IRIS-SWORD, m/m)", labelpad=8) + ax.set_ylabel("Calibrated Manning's n", labelpad=8) + ax.set_title("Manning's n vs Channel Slope by Zone\n" + "(line = log-linear trend per zone)", pad=10) + ax.legend(handles=legend_handles, loc="upper right") + out = out_dir / "07_n_vs_slope.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_08_stage_shift(zone_df: pd.DataFrame, out_dir: Path): + """HAND stage shift (Δh = h_calib - h_orig) at recurrence flows — box plot.""" + df = zone_df.dropna(subset=["delta_h"]) + if df.empty: + return + + fig, ax = plt.subplots(figsize=(11, 6), constrained_layout=True) + yr_order = RECURRENCE_YEARS + data = [df[df["recurrence_yr"] == yr]["delta_h"].values for yr in yr_order] + n_pts = [len(d) for d in data] + + bp = ax.boxplot(data, patch_artist=True, + medianprops=dict(color="#111", lw=2.2), + whiskerprops=dict(color="#555"), + capprops=dict(color="#555"), + flierprops=dict(marker="o", ms=5, alpha=0.5)) + for patch, col in zip(bp["boxes"], ZONE_CLR[:5]): + patch.set_facecolor(col) + patch.set_alpha(0.7) + + rng = np.random.default_rng(0) + for i, d in enumerate(data, 1): + jitter = rng.uniform(-0.14, 0.14, len(d)) + ax.scatter(np.full(len(d), i) + jitter, d, + color=ZONE_CLR[i - 1], s=30, alpha=0.75, + edgecolors="#333", linewidths=0.5, zorder=5) + + ax.axhline(0, color="#c0392b", ls="--", lw=1.5, alpha=0.8, + label="No change (Δh = 0)") + ax.set_xticks(range(1, 6)) + ax.set_xticklabels([f"{yr} yr\n(n={n})" for yr, n in zip(yr_order, n_pts)]) + ax.set_xlabel("Recurrence Interval") + ax.set_ylabel("HAND Stage Shift Δh (m)\n(positive = calibration raises flood stage)") + ax.set_title("Flood Stage Shift from Calibration at NWM Recurrence Flows\n" + "(impact on FIM extent: Δh > 0 means larger predicted inundation)") + ax.legend() + out = out_dir / "08_stage_shift.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def plot_09_n_direction(zone_df: pd.DataFrame, out_dir: Path): + """Stacked bar + table: how many gauges had n increase vs decrease from 0.06.""" + df = zone_df.dropna(subset=["n_calibrated"]).copy() + tol = 0.005 + df["direction"] = "No change" + df.loc[df["n_calibrated"] > 0.06 + tol, "direction"] = "Increased" + df.loc[df["n_calibrated"] < 0.06 - tol, "direction"] = "Decreased" + + zones = list(range(1, 6)) + yr_labels = [f"Zone {z}\n({RECURRENCE_YEARS[z-1]} yr)" for z in zones] + counts = {k: [] for k in ("Increased", "Decreased", "No change")} + for z in zones: + sub = df[df["zone"] == z] + for k in counts: + counts[k].append((sub["direction"] == k).sum()) + + fig = plt.figure(figsize=(13, 9), constrained_layout=False) + fig.subplots_adjust(top=0.91, bottom=0.30, left=0.08, right=0.96) + ax = fig.add_subplot(111) + + x = np.arange(len(zones)) + w = 0.52 + c_inc = "#c0392b" + c_dec = "#2471a3" + c_none = "#bdc3c7" + + b_inc = ax.bar(x, counts["Increased"], w, label="Increased (n > 0.06 + ε)", color=c_inc, alpha=0.85) + b_dec = ax.bar(x, counts["Decreased"], w, bottom=counts["Increased"], + label="Decreased (n < 0.06 − ε)", color=c_dec, alpha=0.85) + b_none = ax.bar(x, counts["No change"], w, + bottom=[i + d for i, d in zip(counts["Increased"], counts["Decreased"])], + label=f"No change (|n − 0.06| ≤ {tol})", color=c_none, alpha=0.75) + + for i in range(len(zones)): + total = sum(v[i] for v in counts.values()) + if total == 0: + continue + n_i = counts["Increased"][i] + n_d = counts["Decreased"][i] + n_s = counts["No change"][i] + if n_i > 0: + ax.text(i, n_i / 2, f"{n_i}\n({100*n_i/total:.0f}%)", + ha="center", va="center", fontsize=9.5, color="white", fontweight="bold") + if n_d > 0: + ax.text(i, n_i + n_d / 2, f"{n_d}\n({100*n_d/total:.0f}%)", + ha="center", va="center", fontsize=9.5, color="white", fontweight="bold") + if n_s > 2: + ax.text(i, n_i + n_d + n_s / 2, f"{n_s}\n({100*n_s/total:.0f}%)", + ha="center", va="center", fontsize=9, color="#444") + + ax.set_xticks(x) + ax.set_xticklabels(yr_labels, fontsize=10.5) + ax.set_ylabel("Number of Gauges") + ax.set_title("Direction of Manning's n Change from Default (n = 0.06)\nby Calibration Zone", + pad=10) + ax.legend(loc="upper right", fontsize=9.5) + + # Summary table below the chart + t_inc = sum(counts["Increased"]) + t_dec = sum(counts["Decreased"]) + t_none = sum(counts["No change"]) + grand = t_inc + t_dec + t_none + + totals = [sum(v[i] for v in counts.values()) for i in range(len(zones))] + pct = lambda n, t: f"{100*n/t:.0f}%" if t > 0 else "—" + + col_lbl = [f"Zone {z}" for z in zones] + ["TOTAL"] + row_data = [ + [str(counts["Increased"][i]) for i in range(5)] + [str(t_inc)], + [pct(counts["Increased"][i], totals[i]) for i in range(5)] + [pct(t_inc, grand)], + [str(counts["Decreased"][i]) for i in range(5)] + [str(t_dec)], + [pct(counts["Decreased"][i], totals[i]) for i in range(5)] + [pct(t_dec, grand)], + [str(counts["No change"][i]) for i in range(5)] + [str(t_none)], + [pct(counts["No change"][i], totals[i]) for i in range(5)] + [pct(t_none, grand)], + [str(t) for t in totals] + [str(grand)], + ] + row_lbl = ["n Increased", " % Increased", + "n Decreased", " % Decreased", + "No Change", " % No Change", + "TOTAL"] + + ax_tab = fig.add_axes([0.05, 0.02, 0.91, 0.24]) + ax_tab.axis("off") + tbl = ax_tab.table(cellText=row_data, rowLabels=row_lbl, colLabels=col_lbl, + loc="center", cellLoc="center") + tbl.auto_set_font_size(False) + tbl.set_fontsize(9.5) + tbl.scale(1, 1.35) + row_bg = {0: c_inc, 2: c_dec, 4: c_none} + for (r, c), cell in tbl.get_celld().items(): + if r == 0: + cell.set_facecolor("#2c3e50") + cell.set_text_props(color="white", fontweight="bold") + elif (r - 1) in row_bg: + cell.set_facecolor(row_bg[r - 1]) + cell.set_alpha(0.35) + cell.set_edgecolor("#ccc") + + fig.suptitle("Manning's n Calibration Direction Analysis", fontsize=14, fontweight="bold") + out = out_dir / "09_n_direction.png" + fig.savefig(out, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info(" Saved: %s", out.name) + + +def collect_src_curves(hucs, rc, recur): + """ + Returns two parallel lists (orig_curves, calib_curves). + Each element: dict with keys huc8, bid, loc_id, q (ndarray), h (ndarray). + One entry per calibrated HydroID. + """ + orig_curves, calib_curves = [], [] + for huc8 in hucs: + elev_path = OUT_DIR / f"HUC{huc8}" / "usgs_elev_table.csv" + if not elev_path.exists(): + continue + elev = pd.read_csv(elev_path, dtype={"location_id": str, "feature_id": "Int64"}) + gauges = _discover_calibrated(huc8, rc, elev) + if not gauges: + continue + by_branch: dict[str, list] = {} + for g in gauges: + by_branch.setdefault(g["bid"], []).append(g) + for bid, gage_list in by_branch.items(): + branch_dir = gage_list[0]["branch_dir"] + src_o = _load_src(branch_dir, bid, original=True) + src_c = _load_src(branch_dir, bid, original=False) + if src_o is None or src_c is None: + continue + for g in gage_list: + hid = g["hydroid"] + s_o = src_o[src_o["HydroID"] == hid].sort_values("Stage") + s_c = src_c[src_c["HydroID"] == hid].sort_values("Stage") + if s_o.empty or s_c.empty: + continue + base = {"huc8": huc8, "bid": bid, "loc_id": g["location_id"]} + orig_curves.append({**base, + "q": s_o["Discharge (m3s-1)"].values.copy(), + "h": s_o["Stage"].values.copy()}) + calib_curves.append({**base, + "q": s_c["Discharge (m3s-1)"].values.copy(), + "h": s_c["Stage"].values.copy()}) + return orig_curves, calib_curves + + +def _draw_src_ensemble(ax, curves, title, line_color, + shared_xlim=None, shared_ylim=None): + """Draw individual SRC ensemble with median and IQR band. + + shared_xlim / shared_ylim: pass the same values to both original and + calibrated figures so the axes are directly comparable. + """ + h_max = float(max(c["h"].max() for c in curves)) + h_grid = np.linspace(0.01, h_max, 350) + + # Interpolate each curve onto common h grid (Q as function of h) + q_mat = np.full((len(curves), len(h_grid)), np.nan) + for i, c in enumerate(curves): + q_mat[i] = np.interp(h_grid, c["h"], c["q"], left=np.nan, right=np.nan) + + # Individual curves — slightly darker so they read against white background + for c in curves: + ax.plot(c["q"], c["h"], color="#777", lw=0.7, alpha=0.28, zorder=2) + + # Percentile bands + with np.errstate(all="ignore"): + q_med = np.nanmedian(q_mat, axis=0) + q_p25 = np.nanpercentile(q_mat, 25, axis=0) + q_p75 = np.nanpercentile(q_mat, 75, axis=0) + q_p10 = np.nanpercentile(q_mat, 10, axis=0) + q_p90 = np.nanpercentile(q_mat, 90, axis=0) + + ax.fill_betweenx(h_grid, q_p10, q_p90, color=line_color, alpha=0.13, + zorder=3, label="10th–90th %ile") + ax.fill_betweenx(h_grid, q_p25, q_p75, color=line_color, alpha=0.28, + zorder=4, label="IQR (25th–75th %ile)") + ax.plot(q_med, h_grid, color=line_color, lw=3.0, zorder=6, label="Median") + + # Axes — use shared limits when provided so both plots are directly comparable + if shared_xlim is not None: + ax.set_xlim(0, shared_xlim) + else: + q_all = np.concatenate([c["q"] for c in curves]) + ax.set_xlim(0, float(np.nanpercentile(q_all, 99)) * 1.05) + ax.set_ylim(0, shared_ylim if shared_ylim is not None else h_max) + ax.set_xlabel("Discharge (m³/s)", labelpad=8) + ax.set_ylabel("HAND Stage (m above thalweg)", labelpad=8) + ax.set_title(title, pad=10) + ax.legend(loc="upper left", fontsize=9.5) + ax.text(0.98, 0.02, f"n = {len(curves)} SRC curves", + transform=ax.transAxes, ha="right", va="bottom", fontsize=9.5, color="#555") + ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{x:,.0f}")) + + +def plot_src_overlay(orig_curves, calib_curves, out_dir: Path): + """Two figures: all original SRCs overlaid / all calibrated SRCs overlaid. + + Both figures use the same x and y axis limits so ensemble shapes can be + compared directly side-by-side. + """ + if not orig_curves: + return + + # Compute shared scales from both ensembles combined + all_q = np.concatenate([c["q"] for c in orig_curves + calib_curves]) + shared_xlim = float(np.nanpercentile(all_q, 99)) * 1.05 + shared_ylim = float(max(c["h"].max() for c in orig_curves + calib_curves)) + + fig1, ax1 = plt.subplots(figsize=(12, 7.5), constrained_layout=True) + _draw_src_ensemble(ax1, orig_curves, + "All Original SRCs — Gauged Branches (pre-calibration, n = 0.06)", + C_ORIG, shared_xlim=shared_xlim, shared_ylim=shared_ylim) + fig1.suptitle("Synthetic Rating Curve Ensemble — Original", fontsize=13, fontweight="bold") + out1 = out_dir / "10_src_ensemble_original.png" + fig1.savefig(out1, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig1) + log.info(" Saved: %s", out1.name) + + fig2, ax2 = plt.subplots(figsize=(12, 7.5), constrained_layout=True) + _draw_src_ensemble(ax2, calib_curves, + "All Calibrated SRCs — Gauged Branches (post-calibration)", + C_CALIB, shared_xlim=shared_xlim, shared_ylim=shared_ylim) + fig2.suptitle("Synthetic Rating Curve Ensemble — Calibrated", fontsize=13, fontweight="bold") + out2 = out_dir / "11_src_ensemble_calibrated.png" + fig2.savefig(out2, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig2) + log.info(" Saved: %s", out2.name) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 6 — Main +# ══════════════════════════════════════════════════════════════════ + +def main(): + SUMMARY_DIR.mkdir(parents=True, exist_ok=True) + + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + log.info("Study area: %d HUCs", len(hucs)) + + log.info("Loading shared tables …") + rc, recur = _load_shared_data() + + # ── Individual RC plots ──────────────────────────────────────── + plot_individual_branches(hucs, rc, recur) + + # ── Collect aggregated data ──────────────────────────────────── + log.info("Collecting data for aggregate plots …") + master_df, metrics_df, zone_df, coverage_df = collect_all_data(hucs, rc, recur) + + n_gauges = metrics_df["location_id"].nunique() if not metrics_df.empty else 0 + log.info(" %d calibrated gauge(s) with metrics", n_gauges) + + # ── Aggregate plots ──────────────────────────────────────────── + log.info("=== Aggregate plots ===") + plot_01_n_profiles(zone_df, SUMMARY_DIR) + plot_02_q_scatter(master_df, SUMMARY_DIR) + plot_03_adj_ratio(master_df, SUMMARY_DIR) + plot_04_n_by_zone(zone_df, SUMMARY_DIR) + plot_05_coverage(coverage_df, SUMMARY_DIR) + plot_06_metrics(metrics_df, SUMMARY_DIR) + plot_07_n_vs_slope(zone_df, SUMMARY_DIR) + plot_08_stage_shift(zone_df, SUMMARY_DIR) + plot_09_n_direction(zone_df, SUMMARY_DIR) + + # ── SRC ensemble plots ───────────────────────────────────────── + log.info("Collecting SRC curves for ensemble plots …") + orig_curves, calib_curves = collect_src_curves(hucs, rc, recur) + log.info(" %d SRC curves collected", len(orig_curves)) + plot_src_overlay(orig_curves, calib_curves, SUMMARY_DIR) + + # ── Metrics summary CSV ──────────────────────────────────────── + if not metrics_df.empty: + cols_order = [ + "huc8", "bid", "location_id", "hydroid", "n_pts", + "nse_orig", "nse_calib", "kge_orig", "kge_calib", + "pbias_orig", "pbias_calib", "rmse_orig", "rmse_calib", + ] + metrics_df[cols_order].round(4).to_csv( + SUMMARY_DIR / "metrics_summary.csv", index=False + ) + log.info(" Saved: metrics_summary.csv") + + # Print summary + log.info("\n── Performance Summary ──────────────────────────────────") + for _, row in metrics_df.iterrows(): + log.info( + " %-10s %-13s NSE %+.3f→%+.3f KGE %+.3f→%+.3f " + "PBIAS %+.1f%%→%+.1f%% RMSE %.1f→%.1f m³/s", + row["huc8"], row["location_id"], + row["nse_orig"], row["nse_calib"], + row["kge_orig"], row["kge_calib"], + row["pbias_orig"], row["pbias_calib"], + row["rmse_orig"], row["rmse_calib"], + ) + + log.info("\nAll outputs written to:") + log.info(" Individual plots → E:/SI/out/HUC{{huc8}}/src_plots/") + log.info(" Aggregate plots → %s", SUMMARY_DIR) + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_calibration_slides.py b/scripts/plot_calibration_slides.py new file mode 100644 index 0000000..8ef3227 --- /dev/null +++ b/scripts/plot_calibration_slides.py @@ -0,0 +1,749 @@ +""" +PPT-ready technical slide deck for the FIMbox calibration pipeline. + +Slides: + 01 — Branch Processing: What is a Branch? + 02 — Reference Frame Reconciliation: USGS RC vs SRC + 03 — Our Calibration: Manning's n Back-Calculation (s05) + 04 — Main Repo Calibration: Discharge Correction Coefficient (Stage 4) + 05 — Side-by-Side Comparison Table + 06 — The Collision: What Happens When Stage 4 Runs + 07 — Items Needing Attention Before Stage 5 + +Run: + .venv\\Scripts\\python.exe scripts/plot_calibration_slides.py +""" +from pathlib import Path + +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.patches as mpatches +from matplotlib.patches import FancyBboxPatch + +mpl.rcParams["font.family"] = "DejaVu Sans" + +SAVE_TO = Path("D:/SI/out/HUC03020102/src_plots/slides") + +# ── Palette ─────────────────────────────────────────────────────────────────── +C_NAVY = "#1b4f72" +C_DBLUE = "#1a5276" +C_ORANGE = "#a04000" +C_GREEN = "#1a6b3c" +C_TEAL = "#0e6655" +C_PURPLE = "#6c3483" +C_WARN = "#922b21" +C_AMBER = "#7d6608" +C_GRAY = "#566573" +C_BG = "#f4f6f7" +C_PLAIN = "#0b3d5e" # plain-language box header +C_TECH = "#3b0f5e" # technical box header +C_STRIPE = "#d5d8dc" # table stripe + +W, H = 16, 9 # figure size in inches (16:9) + + +# ── Low-level helpers ───────────────────────────────────────────────────────── + +def new_slide(): + fig, ax = plt.subplots(figsize=(W, H)) + ax.set_xlim(0, W) + ax.set_ylim(0, H) + ax.axis("off") + fig.patch.set_facecolor(C_BG) + ax.set_facecolor(C_BG) + return fig, ax + + +def header_bar(ax, title, subtitle="", slide_n=""): + ax.add_patch(FancyBboxPatch( + (0, H - 1.15), W, 1.15, + boxstyle="square,pad=0", facecolor=C_NAVY, + edgecolor="none", zorder=3, + )) + ax.text(0.30, H - 0.50, title, + ha="left", va="center", fontsize=16, fontweight="bold", + color="white", zorder=4) + if subtitle: + ax.text(0.30, H - 0.88, subtitle, + ha="left", va="center", fontsize=9.5, color="#a9cce3", zorder=4) + if slide_n: + ax.text(W - 0.25, H - 0.60, slide_n, + ha="right", va="center", fontsize=9, color="#aaaaaa", zorder=4) + + +def content_box(ax, x, y, w, h, box_title, lines, header_color, + bg="#e8ecef", title_fs=11, body_fs=9.5, line_gap=1.6): + ax.add_patch(FancyBboxPatch( + (x, y), w, h, + boxstyle="round,pad=0.10", facecolor=bg, + edgecolor="white", linewidth=1.8, zorder=3, + )) + # coloured left accent strip + ax.add_patch(FancyBboxPatch( + (x, y), 0.12, h, + boxstyle="square,pad=0", facecolor=header_color, + edgecolor="none", zorder=4, + )) + ax.text(x + 0.22, y + h - 0.30, box_title, + ha="left", va="top", fontsize=title_fs, fontweight="bold", + color=header_color, zorder=5) + ax.text(x + 0.22, y + h - 0.62, "\n".join(lines), + ha="left", va="top", fontsize=body_fs, + color="#1a1a1a", linespacing=line_gap, zorder=5) + + +def footnote(ax, text, y=0.18): + ax.text(W / 2, y, text, + ha="center", va="center", fontsize=8.5, color="#555", + style="italic", zorder=4) + + +def divider(ax, y, x0=0.25, x1=None): + ax.plot([x0, x1 or W - 0.25], [y, y], + color="#b2bec3", lw=0.8, ls="--", zorder=2) + + +def save_slide(fig, name): + SAVE_TO.mkdir(parents=True, exist_ok=True) + out = SAVE_TO / name + fig.savefig(out, dpi=180, bbox_inches="tight", facecolor=C_BG) + plt.close(fig) + print(f" Saved: {out.name}") + + +# ── Slide 01 — Branch Processing ────────────────────────────────────────────── + +def slide_01(): + fig, ax = new_slide() + header_bar(ax, + "Branch Processing — How a HUC8 is Split Into Workable Units", + subtitle="Level-path derivation from NWM stream network topology", + slide_n="01 / 07") + + content_box(ax, 0.25, 1.05, 7.35, 6.55, + "Plain Language", + [ + "A branch = one river corridor.", + "Starting where a tributary joins the main river,", + "running all the way up to the tributary's headwater.", + "", + "HUC 03020102 has 17 branches:", + " Branch 0 — whole-basin trunk (processed first)", + " 3351000014–3351000029 — 16 individual stream corridors", + "", + "Each branch is processed independently:", + " its own DEM clip, HAND raster, and rating curve table.", + "", + "Why split at all?", + " Memory: a full-basin HAND raster is too large to compute", + " in one pass. Branches let us parallelize and tile.", + ], + C_PLAIN, bg="#e8f4fc", body_fs=10.0, line_gap=1.55) + + content_box(ax, 7.90, 1.05, 7.85, 6.55, + "Technical Detail (branch_derivation.py)", + [ + "Input: NWM streams for the HUC", + " → filter: drop stream orders 1 & 2 (line 137)", + " → result: only order ≥ 3 tributaries remain", + "", + "_assign_levelpaths() (lines 704–803):", + " 1. Find all network outlets (no downstream reach)", + " 2. Walk upstream from each outlet", + " 3. At each confluence — pick ONE upstream reach", + " to continue the same level path:", + " → highest stream order wins", + " → tie: largest arbolate sum; then reach ID", + " 4. All other upstream tributaries at that junction", + " → start a NEW level path (new branch ID)", + "", + "Branch IDs are synthetic (e.g. 3351000014 = first 4 chars", + "of NWM reach ID + sequential suffix)", + "", + "_remove_branches_without_catchments() (line 806):", + " Drops any branch with zero NWM catchment polygons", + " → prevents empty-DEM crashes downstream", + ], + C_TECH, bg="#f3eafd", body_fs=9.2, line_gap=1.50) + + ax.add_patch(FancyBboxPatch( + (0.25, 0.18), 15.50, 0.72, + boxstyle="round,pad=0.08", facecolor=C_TEAL, + edgecolor="none", linewidth=0, zorder=3, + )) + ax.text(8.0, 0.54, + "HUC 03020102: 17 branches = 1 trunk (Branch 0) + 16 level-path branches " + "(3351000014 → 3351000029)", + ha="center", va="center", fontsize=10.5, fontweight="bold", + color="white", zorder=4) + + save_slide(fig, "slide_01_branch_processing.png") + + +# ── Slide 02 — Reference Frame Reconciliation ───────────────────────────────── + +def slide_02(): + fig, ax = new_slide() + header_bar(ax, + "Reference Frame Reconciliation — USGS RC vs Synthetic RC (SRC)", + subtitle="Converting absolute NAVD88 elevation to HAND (height above thalweg)", + slide_n="02 / 07") + + content_box(ax, 0.25, 4.30, 7.35, 3.40, + "Plain Language", + [ + "USGS gauges report water-surface elevation in feet", + "above sea level (NAVD88 datum).", + "", + "Our Synthetic Rating Curves use a different reference:", + "height above the riverbed (HAND stage).", + "", + "To compare them we subtract the riverbed elevation", + "(sampled from the DEM at the snapped gauge location).", + "", + "The SRC has only discrete 1-foot stage steps, so we", + 'find the "closest row" rather than an exact match.', + ], + C_PLAIN, bg="#e8f4fc", body_fs=10.0, line_gap=1.60) + + content_box(ax, 0.25, 1.05, 7.35, 3.00, + "Technical Detail (s05_calibrate_n_recurrence.py lines 95–194)", + [ + "Unit conversion (lines 95–100):", + " rc['flow_cms'] = rc['flow'] / 35.3147 # cfs → m³/s", + " rc['elev_m'] = rc['elevation_navd88'] / 3.28084 # ft → m", + "", + "Reference conversion (lines 154–155):", + " elev_r = np.interp(Q_r, gage_rc['flow_cms'],", + " gage_rc['elev_m']) # NAVD88 water surface (m)", + " hand_r = elev_r − dem_adj_elevation_m # → HAND stage", + "", + "Nearest-neighbour SRC lookup (line 192):", + " idx = (hdf['Stage'] − hand_r).abs().idxmin()", + " row = hdf.loc[idx] # → A, R geometry at that stage", + ], + C_TECH, bg="#f3eafd", body_fs=9.0, line_gap=1.55) + + # ── Diagram ──────────────────────────────────────────────────────────────── + dx = 9.80 + bx = dx # left edge of diagram area + # Ground / thalweg line + ax.plot([bx, bx + 5.8], [3.00, 3.00], color="#7d6608", lw=2.5, zorder=4) + ax.text(bx + 5.85, 3.00, "Thalweg / riverbed", va="center", fontsize=9, color="#7d6608") + # NAVD88 datum + ax.plot([bx, bx + 5.8], [1.45, 1.45], color="#566573", lw=1.5, ls="--", zorder=4) + ax.text(bx + 5.85, 1.45, "NAVD88 datum", va="center", fontsize=9, color="#566573") + # Water surface + ax.plot([bx, bx + 5.8], [5.50, 5.50], color="#2980b9", lw=2.5, zorder=4) + ax.text(bx + 5.85, 5.50, "Water surface", va="center", fontsize=9, color="#2980b9") + + # Arrow: NAVD88 → thalweg (dem_adj_elevation_m) + cx = bx + 1.2 + ax.annotate("", xy=(cx, 3.00), xytext=(cx, 1.45), + arrowprops=dict(arrowstyle="<->", color=C_ORANGE, lw=1.8), zorder=5) + ax.text(cx - 0.12, 2.22, "dem_adj_elevation_m\n(thalweg elev, from DEM)", + ha="right", va="center", fontsize=8.5, color=C_ORANGE) + + # Arrow: thalweg → water surface (HAND stage) + cx2 = bx + 2.8 + ax.annotate("", xy=(cx2, 5.50), xytext=(cx2, 3.00), + arrowprops=dict(arrowstyle="<->", color=C_GREEN, lw=2.2), zorder=5) + ax.text(cx2 + 0.12, 4.25, "HAND stage = hand_r\n(what SRC Stage column uses)", + ha="left", va="center", fontsize=8.5, color=C_GREEN) + + # Arrow: NAVD88 → water surface (elev_m from USGS RC) + cx3 = bx + 4.4 + ax.annotate("", xy=(cx3, 5.50), xytext=(cx3, 1.45), + arrowprops=dict(arrowstyle="<->", color=C_NAVY, lw=2.2), zorder=5) + ax.text(cx3 + 0.12, 3.47, "elev_r (NAVD88 water surface)\n(what USGS RC reports)", + ha="left", va="center", fontsize=8.5, color=C_NAVY) + + # Equation box + ax.add_patch(FancyBboxPatch( + (bx - 0.10, 5.80), 5.90, 0.75, + boxstyle="round,pad=0.08", facecolor="#fdfefe", + edgecolor="#aab7b8", linewidth=1.2, zorder=3, + )) + ax.text(bx + 2.85, 6.18, + "hand_r = elev_r − dem_adj_elevation_m", + ha="center", va="center", fontsize=11, fontweight="bold", + color=C_NAVY, zorder=5, + fontfamily="monospace") + + ax.add_patch(FancyBboxPatch( + (0.25, 0.18), 15.50, 0.60, + boxstyle="round,pad=0.08", facecolor="#eaecee", + edgecolor="#aab7b8", linewidth=0.8, zorder=2, + )) + ax.text(W / 2, 0.48, + "dem_adj_elevation_m comes from usgs_elev_table.csv" + " — DEM elevation sampled at the snapped gauge point during the USGS Crosswalk step (s04)", + ha="center", va="center", fontsize=8.8, color="#444", style="italic", zorder=3) + + save_slide(fig, "slide_02_reference_frames.png") + + +# ── Slide 03 — Our Calibration ──────────────────────────────────────────────── + +def slide_03(): + fig, ax = new_slide() + header_bar(ax, + "Our Calibration — Manning's n Back-Calculation", + subtitle="s05_calibrate_n_recurrence.py | direct algebraic inversion at 5 NWM recurrence points", + slide_n="03 / 07") + + content_box(ax, 0.25, 3.80, 7.35, 3.95, + "Plain Language", + [ + "We ask: at each of 5 flood levels (2, 5, 10, 25, 50-year", + "recurrence), what roughness value n makes the SRC predict", + "the same flow as the USGS gauge?", + "", + "Result: 6 zones — one per recurrence interval boundary,", + "plus everything above the 50-yr level.", + "Each zone gets its own constant n value.", + "", + "Only 2 of 17 branches get this treatment:", + " → those with a USGS gauge AND an observed rating curve.", + "", + "The same n is applied uniformly to every reach (HydroID)", + "on that branch within each zone.", + ], + C_PLAIN, bg="#e8f4fc", body_fs=10.0, line_gap=1.60) + + content_box(ax, 0.25, 1.05, 7.35, 2.50, + "Technical Detail", + [ + "Manning's equation (inverse): n = A · R^(2/3) · S^(1/2) / Q", + "", + "At each recurrence flow Q_r (2, 5, 10, 25, 50 yr):", + " 1. Interpolate USGS RC → elev_r (NAVD88 water surface)", + " 2. Convert: hand_r = elev_r − dem_adj_elevation_m (HAND stage)", + " 3. Nearest-neighbour SRC lookup → get A, R for that stage", + " 4. n_zone = A·R^(2/3)·sqrt(SLOPE) / Q_r clipped to [0.01, 0.30]", + " 5. Recompute Q: Discharge = A·R^(2/3)·sqrt(SLOPE) / n_zone", + " → for every HydroID × every Stage row in that zone", + "Writes ManningN + Discharge (m3s-1) → src_full_crosswalked_{bid}.csv", + "Does NOT touch: default_ManningN, default_SLOPE, hydroTable_{bid}.csv", + ], + C_TECH, bg="#f3eafd", body_fs=9.2, line_gap=1.52) + + # ── Zone diagram ────────────────────────────────────────────────────────── + zx = 8.10 + zy_base = 1.05 + zone_h = 6.30 / 6 # 6 zones stacked in 6.30 inches + zone_labels = ["Zone 1\n< 2yr", "Zone 2\n2–5yr", "Zone 3\n5–10yr", + "Zone 4\n10–25yr", "Zone 5\n25–50yr", "Zone 6\n> 50yr"] + n_14 = [0.065, 0.049, 0.039, 0.016, 0.011, 0.011] + n_23 = [0.090, 0.083, 0.044, 0.024, 0.011, 0.011] + zone_cols = ["#aed6f1", "#a9dfbf", "#fad7a0", "#f9e79f", "#f1948a", "#d2b4de"] + + for i in range(6): + by = zy_base + i * zone_h + col = zone_cols[i] + ax.add_patch(FancyBboxPatch( + (zx, by), 7.65, zone_h - 0.04, + boxstyle="square,pad=0", facecolor=col, + edgecolor="#cccccc", linewidth=0.6, alpha=0.55, zorder=3, + )) + # Zone label + ax.text(zx + 0.12, by + zone_h / 2 - 0.02, zone_labels[i], + ha="left", va="center", fontsize=8, color="#333", + fontweight="bold", zorder=4, linespacing=1.3) + # n values for branch 14 + ax.text(zx + 2.85, by + zone_h / 2 - 0.02, f"n = {n_14[i]:.3f}", + ha="center", va="center", fontsize=9.5, color=C_NAVY, + fontweight="bold", zorder=4) + # n values for branch 23 + ax.text(zx + 5.85, by + zone_h / 2 - 0.02, f"n = {n_23[i]:.3f}", + ha="center", va="center", fontsize=9.5, color=C_ORANGE, + fontweight="bold", zorder=4) + + # Column headers + ax.text(zx + 2.85, zy_base + 6 * zone_h + 0.22, + "Branch 3351000014\n(Gauge 02083000)", + ha="center", va="bottom", fontsize=9, color=C_NAVY, fontweight="bold", + linespacing=1.4, zorder=4) + ax.text(zx + 5.85, zy_base + 6 * zone_h + 0.22, + "Branch 3351000023\n(Gauge 02082950)", + ha="center", va="bottom", fontsize=9, color=C_ORANGE, fontweight="bold", + linespacing=1.4, zorder=4) + + # Jump annotation for Branch 23 + ax.annotate( + "8.2× jump\n(non-monotonic SRC risk)", + xy=(zx + 5.85, zy_base + zone_h + 0.03), + xytext=(zx + 7.30, zy_base + zone_h * 1.5), + fontsize=8, color=C_WARN, fontweight="bold", + arrowprops=dict(arrowstyle="->", color=C_WARN, lw=1.2), + ha="left", va="center", zorder=6, + ) + + save_slide(fig, "slide_03_our_calibration.png") + + +# ── Slide 04 — Main Repo Calibration ───────────────────────────────────────── + +def slide_04(): + fig, ax = new_slide() + header_bar(ax, + "Main Repo Calibration — Discharge Correction Coefficient (Stage 4)", + subtitle="src_calibrate.py / src_optimization.py | empirical scalar correction per HydroID", + slide_n="04 / 07") + + content_box(ax, 0.25, 3.75, 7.35, 4.00, + "Plain Language", + [ + "The main pipeline's question:", + " Is the SRC's predicted flow too high or too low", + " compared to the gauge? Apply a single multiplier.", + "", + "Unlike our approach, this does NOT recompute n.", + "It just scales the existing discharge curve up or down", + "by one factor per stream segment (HydroID).", + "", + "The correction is then spread to nearby un-gauged", + "segments within 8 km by network tracing.", + "", + "It runs AFTER several other adjustments:", + " bathymetry → bankfull → channel/overbank subdivision", + " → nonmonotonic fix → USGS rating calibration (← here)", + "", + "So it calibrates an already-modified curve, not the", + "raw Stage 2 SRC that our script worked on.", + ], + C_PLAIN, bg="#e8f4fc", body_fs=9.8, line_gap=1.55) + + content_box(ax, 0.25, 1.05, 7.35, 2.45, + "Technical Detail (src_optimization.py)", + [ + "1. _build_usgs_database: match USGS RC to each NWM recurrence flow", + " → (hand, Q_obs) pairs at each recurrence interval", + "2. For each pair: find nearest hydroTable row → get Q_src", + " calb_coef = Q_src / Q_obs (per HydroID, per interval)", + "3. Per HydroID: calb_coef_final = median(calb_coef over all intervals)", + " → ONE scalar per HydroID, not stage-varying", + "4. Q_new(h) = Q_old(h) / calb_coef_final for ALL stage rows", + "5. Network propagation: _trace_network ± 8 km (same stream order)", + " group_manningn_calc: running-mean coef downstream 8 km", + "6. Writes: hydroTable_{bid}.csv (discharge_cms, calb_coef_usgs)", + " Gated: src_adjust_usgs=True AND src_subdiv_toggle=True", + ], + C_TECH, bg="#f3eafd", body_fs=9.2, line_gap=1.52) + + # ── Propagation diagram ─────────────────────────────────────────────────── + px = 8.10 + py_base = 1.40 + # River network sketch + # Main reach (horizontal) ───────────────────── + ax.annotate("", xy=(px + 6.80, py_base + 2.40), + xytext=(px, py_base + 2.40), + arrowprops=dict(arrowstyle="-|>", color="#2980b9", lw=2.0), zorder=4) + # Tributary + ax.annotate("", xy=(px + 3.60, py_base + 2.40), + xytext=(px + 2.20, py_base + 4.20), + arrowprops=dict(arrowstyle="-|>", color="#2980b9", lw=1.5), zorder=4) + + # Gauge HydroID (orange fill) + gx, gy = px + 3.60, py_base + 2.40 + ax.add_patch(plt.Circle((gx, gy), 0.28, color=C_ORANGE, zorder=5)) + ax.text(gx, gy + 0.55, "Gauge\nHydroID", ha="center", va="bottom", + fontsize=8, color=C_ORANGE, fontweight="bold", linespacing=1.3) + + # Traced reaches (±8 km fill) + for dx, label in [(px + 1.8, "−8 km"), (px + 5.4, "+8 km")]: + ax.add_patch(plt.Circle((dx, py_base + 2.40), 0.20, color=C_NAVY, + alpha=0.55, zorder=5)) + ax.text(dx, py_base + 1.85, label, ha="center", fontsize=8, + color=C_NAVY, fontweight="bold") + + # Downstream propagation arrows + for ddx in [px + 5.4, px + 6.8]: + ax.add_patch(plt.Circle((ddx, py_base + 2.40), 0.15, + color=C_TEAL, alpha=0.55, zorder=5)) + ax.text(px + 6.10, py_base + 1.50, + "group mean coef\ncarried ≤ 8 km\ndownstream", + ha="center", va="top", fontsize=8, color=C_TEAL, linespacing=1.4) + + # Legend + for col, lbl, yx in [ + (C_ORANGE, "Direct gauge HydroID", py_base + 5.10), + (C_NAVY, "Traced neighborhood (±8 km, same order)", py_base + 4.65), + (C_TEAL, "Downstream group carry (≤ 8 km)", py_base + 4.20), + ]: + ax.add_patch(plt.Circle((px + 0.22, yx), 0.12, color=col, zorder=5)) + ax.text(px + 0.45, yx, lbl, va="center", fontsize=9, color="#222") + + ax.text(px + 3.60, py_base + 6.10, + "One scalar coefficient per HydroID — applied uniformly across the full discharge curve", + ha="center", fontsize=9.5, fontweight="bold", color=C_NAVY) + + save_slide(fig, "slide_04_main_repo_calibration.png") + + +# ── Slide 05 — Comparison Table ─────────────────────────────────────────────── + +def slide_05(): + fig, ax = new_slide() + header_bar(ax, + "Calibration Methods — Side-by-Side Comparison", + subtitle="Both use the same USGS rating curves and NWM recurrence flows — but produce discharge differently", + slide_n="05 / 07") + + rows = [ + ("Method", + "Manning's n back-calculation\n(algebraic inversion)", + "Discharge correction coefficient\n(empirical scalar)"), + ("What changes", + "ManningN per zone → Q fully\nrecomputed from geometry", + "Q_old × (1 / coef), n unchanged,\ncurve shape preserved"), + ("Discharge output", + "Stage-varying (different n per zone\n→ different curve shape per zone)", + "Single multiplier per HydroID\n(uniform shift up or down)"), + ("Spatial scope", + "All HydroIDs on the gauged branch\n(uniform within each zone)", + "Gauge HydroID ± 8 km network trace\n+ downstream group carry"), + ("Baseline curve", + "Raw SRC from Stage 2\n(src_full_crosswalked_{bid}.csv)", + "Post-subdivision hydroTable\n(bankfull + bathy + subdiv already applied)"), + ("Output file", + "src_full_crosswalked_{bid}.csv\n(ManningN + Discharge (m3s-1))", + "hydroTable_{bid}.csv\n(discharge_cms + calb_coef_usgs)"), + ("Branches affected", + "2 of 17 (gauged with RC data only)\n3351000014, 3351000023", + "Up to all 17 via network propagation\n(any HydroID within 8 km of a gauge)"), + ("Handles ungauged HydroIDs", + "No — only directly gauged branches", + "Yes — network propagation and\nfeature_id-level mean fallback"), + ("Run order in pipeline", + "s05 — runs before Stage 4", + "Stage 4 — runs after s05;\ncalibration_rerun=True resets first"), + ] + + col_w = [3.40, 5.70, 5.70] + col_x = [0.30, 3.90, 9.80] + row_h = 0.61 + y_start = 7.55 + + hdr_colors = ["#2c3e50", C_NAVY, C_ORANGE] + hdr_labels = ["Aspect", "Our Script (s05)", "Main Repo (Stage 4 / UsgsRatingCalibrator)"] + alt_bgs = ["#dce4ec", "#e8f4fc", "#fef5e7"] + base_bgs = ["#eaeff4", "#f0f7fd", "#fdf3e3"] + + for ci, (lbl, cw, cx) in enumerate(zip(hdr_labels, col_w, col_x)): + ax.add_patch(FancyBboxPatch( + (cx, y_start), cw, 0.58, + boxstyle="square,pad=0", facecolor=hdr_colors[ci], + edgecolor="none", zorder=3, + )) + ax.text(cx + cw / 2, y_start + 0.29, lbl, + ha="center", va="center", fontsize=10.5, fontweight="bold", + color="white", zorder=4) + + for ri, row_data in enumerate(rows): + ry = y_start - (ri + 1) * row_h + bg_set = alt_bgs if ri % 2 == 0 else base_bgs + for ci, (cell, cw, cx) in enumerate(zip(row_data, col_w, col_x)): + ax.add_patch(FancyBboxPatch( + (cx, ry), cw, row_h - 0.03, + boxstyle="square,pad=0", facecolor=bg_set[ci], + edgecolor="#d5d8dc", linewidth=0.6, zorder=3, + )) + ax.text(cx + 0.12, ry + row_h / 2, cell, + ha="left", va="center", fontsize=8.5, + color="#1a1a1a", linespacing=1.35, zorder=4) + + footnote(ax, + "Both methods draw on the same source data (usgs_rating_curves.parquet + " + "nwm3_17C_recurrence_flows_cfs.parquet) — they differ in HOW they apply the information.", + y=0.25) + save_slide(fig, "slide_05_comparison_table.png") + + +# ── Slide 06 — The Collision ────────────────────────────────────────────────── + +def slide_06(): + fig, ax = new_slide() + header_bar(ax, + "Critical Issue — Calibration Collision When Stage 4 Runs", + subtitle="calibration_rerun=True in s07_stage4_single_huc.py triggers HydroTableReset, " + "which erases s05 edits", + slide_n="06 / 07") + + content_box(ax, 0.25, 4.05, 7.35, 3.65, + "Plain Language", + [ + "Stage 4 is currently configured to RESET everything", + "back to defaults before it runs its own calibration.", + "", + "The reset reads 'default_ManningN' and 'default_SLOPE'", + "— the original, untouched values from Stage 2.", + "", + "Our s05 script only wrote to 'ManningN' and", + "'Discharge (m3s-1)' — it never updated the default_ columns.", + "", + "So the reset has no way to know we calibrated those values.", + "It just overwrites them silently.", + "", + "Result: if Stage 4 runs, our n-calibration is gone.", + "Whoever runs last wins.", + ], + C_PLAIN, bg="#fef9e7", body_fs=10.0, line_gap=1.55) + + content_box(ax, 0.25, 1.05, 7.35, 2.75, + "Technical Detail (reset.py lines 77–169)", + [ + "HydroTableReset._reset_one() reads:", + " src_base_{bid}.csv + full['default_SLOPE']", + " + full['default_ManningN']", + "Then recomputes:", + " Q = A·R^(2/3)·sqrt(default_SLOPE) / default_ManningN", + "Then overwrites in src_full_crosswalked_{bid}.csv:", + " ManningN ← default_ManningN", + " Discharge (m3s-1) ← recomputed Q", + "", + "s05 writes ONLY: ManningN, Discharge (m3s-1)", + "s05 NEVER writes: default_ManningN, default_SLOPE", + "→ reset has no memory that s05 ran; it reverts blindly", + ], + C_TECH, bg="#fce4e4", body_fs=9.2, line_gap=1.52) + + # ── Flow diagram ────────────────────────────────────────────────────────── + fx = 8.10 + steps = [ + (fx + 3.7, 7.40, "s05 runs", C_TEAL, + "Writes calibrated n\nto src_full_crosswalked_*.csv"), + (fx + 3.7, 5.75, "Stage 4 starts\n(calibration_rerun=True)", C_WARN, + "s07_stage4_single_huc.py line 46"), + (fx + 3.7, 4.05, "HydroTableReset fires\n(pipeline.py line 145)", C_WARN, + "Reads default_ManningN / default_SLOPE\n→ rewrites ManningN + Discharge\n→ s05 work ERASED"), + (fx + 3.7, 2.20, "Stage 4 calibration runs", C_NAVY, + "UsgsRatingCalibrator applies\nits own coef to the reset baseline"), + ] + for sx, sy, title, col, note in steps: + ax.add_patch(FancyBboxPatch( + (sx - 2.60, sy - 0.38), 5.20, 0.82, + boxstyle="round,pad=0.10", facecolor=col, + edgecolor="white", linewidth=1.5, alpha=0.88, zorder=4, + )) + ax.text(sx, sy + 0.04, title, + ha="center", va="center", fontsize=9.5, fontweight="bold", + color="white", zorder=5, linespacing=1.25) + ax.text(sx + 2.75, sy, note, + ha="left", va="center", fontsize=8, color="#333", + linespacing=1.35, zorder=5) + + # Arrows between steps + for i in range(len(steps) - 1): + _, y1, _, _, _ = steps[i] + _, y2, _, _, _ = steps[i + 1] + ax.annotate("", xy=(steps[i+1][0], y2 + 0.44), + xytext=(steps[i][0], y1 - 0.38), + arrowprops=dict(arrowstyle="-|>", color="#555", lw=1.5), zorder=3) + + # Three resolution options — side-by-side boxes (full width, bottom strip) + opt_data = [ + (C_WARN, "Option A (not recommended)", + "Set calibration_rerun=False", + "Skip the reset so Stage 4 stacks on top of s05.\nMethods compound in an uncontrolled way."), + (C_AMBER, "Option B (surgical)", + "Disable src_adjust_usgs for gauged branches", + "Main repo applies bankfull/subdiv/bathy only.\ns05 n-calibration is the final word on gauged branches."), + (C_GREEN, "Option C (safest)", + "Copy s05 n into default_ManningN before Stage 4", + "Makes s05 the new baseline so the reset preserves\nour work and Stage 4 calibrates on top of it."), + ] + bw = (W - 0.50) / 3 # box width per option + for oi, (col, tag, title, body) in enumerate(opt_data): + ox = 0.25 + oi * (bw + 0.02) + ax.add_patch(FancyBboxPatch( + (ox, 0.18), bw, 0.95, + boxstyle="round,pad=0.08", facecolor=col, + edgecolor="white", linewidth=1.2, alpha=0.90, zorder=3, + )) + ax.text(ox + 0.15, 0.99, tag, + ha="left", va="top", fontsize=8.2, color="white", + fontweight="bold", zorder=4) + ax.text(ox + 0.15, 0.76, title, + ha="left", va="top", fontsize=9.5, color="white", + fontweight="bold", zorder=4) + ax.text(ox + 0.15, 0.55, body, + ha="left", va="top", fontsize=8.0, color="#f0f0f0", + linespacing=1.40, zorder=4) + + save_slide(fig, "slide_06_collision.png") + + +# ── Slide 07 — Items Needing Attention ──────────────────────────────────────── + +def slide_07(): + fig, ax = new_slide() + header_bar(ax, + "Items Needing Attention Before Stage 5 (FIM Generation)", + subtitle="HUC 03020102 | Checklist of open issues, risks, and design decisions", + slide_n="07 / 07") + + items = [ + (C_WARN, "CRITICAL — Calibration Collision", + "Stage 4 (s07) is hardcoded with calibration_rerun=True.\n" + "HydroTableReset will silently overwrite s05's ManningN / Discharge back to defaults.\n" + "Decision needed before Stage 4 runs on this HUC (or any of the 24 HUCs)."), + + (C_ORANGE, "SRC Discontinuity — Branch 3351000023", + "n jumps 8.2× at the 2-yr zone boundary (0.011 → 0.090).\n" + "This creates a non-monotonic section in the calibrated SRC: discharge decreases with rising stage.\n" + "Risk: Stage 3–5 inundation mapping may invert the stage–discharge relationship at that level."), + + (C_AMBER, "Gauged Branches With No Rating Curve Data", + "Branches 3351000015 (Gauge 02083410) and 3351000026 (Gauge 02082835)\n" + "have USGS gauges crosswalked but no rating curve rows in usgs_rating_curves.parquet.\n" + "Neither s05 nor Stage 4's UsgsRatingCalibrator can calibrate them → default n=0.06 applies."), + + (C_GRAY, "15 Ungauged Branches — Default n Only", + "Branches 3351000015–3351000029 (excluding 14 & 23) run on global default n = 0.06 (channel)\n" + "and n = 0.12 (overbank) from mannings_global_optz.parquet.\n" + "ML-based n regionalization is future work — not blocking Stage 5, but limits FIM accuracy."), + + (C_PURPLE, "Scale — 24 HUCs Not Yet Tested", + "The full pipeline (including Stage 4) has only been validated on HUC 03020102.\n" + "Before running s07_stage4_all_hucs.py across all 24, confirm collision handling\n" + "and that all HUCs have Stage 2 outputs (branches/ + hydroTable + src_full_crosswalked)."), + ] + + item_h = 1.28 + y0 = 7.60 + for i, (col, title, body) in enumerate(items): + iy = y0 - i * (item_h + 0.04) + ax.add_patch(FancyBboxPatch( + (0.25, iy - item_h), 15.50, item_h, + boxstyle="round,pad=0.08", facecolor="#ffffff", + edgecolor=col, linewidth=2.0, zorder=3, + )) + # left accent + ax.add_patch(FancyBboxPatch( + (0.25, iy - item_h), 0.18, item_h, + boxstyle="square,pad=0", facecolor=col, + edgecolor="none", zorder=4, + )) + ax.text(0.55, iy - 0.30, title, + ha="left", va="top", fontsize=10.5, fontweight="bold", + color=col, zorder=5) + ax.text(0.55, iy - 0.62, body, + ha="left", va="top", fontsize=9.0, color="#222", + linespacing=1.50, zorder=5) + + save_slide(fig, "slide_07_items_attention.png") + + +# ── Main ────────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + print(f"Saving slides to: {SAVE_TO}") + slide_01() + slide_02() + slide_03() + slide_04() + slide_05() + slide_06() + slide_07() + print(f"\nDone — {SAVE_TO}") diff --git a/scripts/plot_src_calibration.py b/scripts/plot_src_calibration.py new file mode 100644 index 0000000..e452c41 --- /dev/null +++ b/scripts/plot_src_calibration.py @@ -0,0 +1,330 @@ +""" +Publication / PPT-ready calibration comparison plots. + +Each subplot shows: + • Coloured horizontal bands — the 6 piecewise-n zones + • White box label (right) — zone range + calibrated Manning's n + • Blue (thick) — original SRC for the gauged HydroID + • Red (thick) — calibrated SRC for the same HydroID + • Green dashed — USGS rating curve in HAND coordinates + • Diamond + straight arrow — NWM recurrence-Q points (cascade-staggered) + • Grey step curve (top axis) — Manning's n as a function of HAND stage + +x-axis: USGS RC Q at 1.5 × h_50yr (~100-yr view) +y-axis: 1.5 × h_50yr HAND stage +Top x-axis: Manning's n (step function across zones, semi-transparent) + +HAND stage = height above thalweg (dem.tif elevation sampled at snapped gauge). + +Run: + .venv\\Scripts\\python.exe scripts/plot_src_calibration.py +""" +from __future__ import annotations +from pathlib import Path + +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.ticker as mticker +import numpy as np +import pandas as pd + +# ── Publication style ───────────────────────────────────────────── +mpl.rcParams.update({ + "font.family": "DejaVu Sans", + "font.size": 11, + "axes.labelsize": 13, + "axes.titlesize": 12, + "xtick.labelsize": 11, + "ytick.labelsize": 11, + "legend.fontsize": 10.5, + "legend.framealpha": 0.93, + "legend.edgecolor": "#cccccc", + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.28, + "grid.linestyle": ":", +}) + +# ── CONFIG ──────────────────────────────────────────────────────── +HUC8 = "03020102" +OUT_DIR = Path("D:/SI/out") +DATA_DIR = Path("data") +# ───────────────────────────────────────────────────────────────── + +AOI_ROOT = OUT_DIR / f"HUC{HUC8}" +WATERSHED = AOI_ROOT / "watershed-data" +BRANCHES = WATERSHED / "branches" +SAVE_TO = AOI_ROOT / "src_plots" + +RECURRENCE_YEARS = [2, 5, 10, 25, 50] + +ZONE_COLOURS = ["#aed6f1", "#a9dfbf", "#fad7a0", "#f9e79f", "#f1948a", "#d2b4de"] +ZONE_ALPHA = 0.40 + + +# ── Loaders ─────────────────────────────────────────────────────── + +def load_data(): + elev = pd.read_csv( + AOI_ROOT / "usgs_elev_table.csv", + dtype={"location_id": str, "feature_id": "Int64"}, + ) + rc = pd.read_parquet(DATA_DIR / "usgs_rating_curves.parquet") + rc["flow_cms"] = rc["flow"].astype(float) / 35.3147 + rc["elev_m"] = rc["elevation_navd88"].astype(float) / 3.28084 + recur = pd.read_parquet(DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet") + for yr in RECURRENCE_YEARS: + recur[f"Q_{yr}yr_cms"] = ( + recur[f"{yr}_0_year_recurrence_flow_17C"].astype(float) * 0.028317 + ) + return elev, rc, recur + + +def load_src(bid: str, original: bool) -> pd.DataFrame: + branch_dir = BRANCHES / bid + fname = f"src_full_crosswalked_{bid}" + path = branch_dir / (fname + (".pre_n_calib.csv" if original else ".csv")) + if original and not path.exists(): + path = branch_dir / f"{fname}.csv" + return pd.read_csv(path, low_memory=False) + + +def find_calibrated_branches() -> list[str]: + return [ + bd.name for bd in sorted(BRANCHES.iterdir()) + if bd.is_dir() + and (bd / f"src_full_crosswalked_{bd.name}.pre_n_calib.csv").exists() + ] + + +# ── Zone helpers ────────────────────────────────────────────────── + +def zone_pts(loc_id, feature_id, dem_adj_m, rc_all, recur): + loc = loc_id.zfill(8) + grc = rc_all[rc_all["location_id"] == loc].sort_values("flow_cms") + if grc.empty: + return [] + feat = recur[recur["feature_id"] == feature_id] + if feat.empty: + return [] + pts = [] + for yr in RECURRENCE_YEARS: + Q = float(feat[f"Q_{yr}yr_cms"].iloc[0]) + if Q < grc["flow_cms"].min() or Q > grc["flow_cms"].max(): + continue + elev = float(np.interp(Q, grc["flow_cms"].values, grc["elev_m"].values)) + h = elev - dem_adj_m + if h > 0: + pts.append((yr, h, Q)) + return sorted(pts, key=lambda x: x[0]) + + +def back_calc_n(hydroid, src_df, zp): + hdf = src_df[src_df["HydroID"] == hydroid].sort_values("Stage") + if hdf.empty: + return {} + slope = float(hdf["SLOPE"].iloc[0]) + if slope <= 0: + return {} + out = {} + for yr, h, Q in zp: + idx = (hdf["Stage"] - h).abs().idxmin() + row = hdf.loc[idx] + A = float(row.get("WetArea (m2)", 0.0)) + R = float(row.get("HydraulicRadius (m)", 0.0)) + if A > 0 and R > 0 and Q > 0: + out[yr] = float(np.clip(A * (R ** (2 / 3)) * slope**0.5 / Q, 0.01, 0.30)) + return out + + +# ── n-step function for secondary axis ─────────────────────────── + +def n_step_curve(zp, n_dict, y_max): + """ + Returns (n_vals, h_vals) describing a horizontal step function: + n is constant within each zone, steps at zone boundaries. + Suitable for plotting on a twiny() axis (n on x, HAND on y). + """ + yrs = [yr for yr, _, _ in zp] + n_arr = [n_dict.get(yr, 0.06) for yr in yrs] + n_arr.append(n_dict.get(yrs[-1], 0.06) if yrs else 0.06) # >50yr zone + + h_brk = [0.0] + [h for _, h, _ in zp] + if h_brk[-1] < y_max: + h_brk.append(y_max) + + nx, hy = [], [] + for i, (h_lo, h_hi) in enumerate(zip(h_brk[:-1], h_brk[1:])): + n_v = n_arr[i] + nx.extend([n_v, n_v]) + hy.extend([h_lo, h_hi]) + if i < len(h_brk) - 2: # horizontal jump at zone boundary + nx.append(n_arr[i + 1]) + hy.append(h_hi) + + return nx, hy + + +# ── Main subplot drawing ────────────────────────────────────────── + +def draw_gauge(ax, bid, loc_id, hydroid, feature_id, dem_adj_m, + src_orig, src_calib, rc_all, recur, branch_avg): + + zp = zone_pts(loc_id, feature_id, dem_adj_m, rc_all, recur) + n_dict = back_calc_n(hydroid, src_orig, zp) + + loc_pad = loc_id.zfill(8) + grc = rc_all[rc_all["location_id"] == loc_pad].sort_values("flow_cms").copy() + grc["hand"] = grc["elev_m"] - dem_adj_m + grc = grc[grc["hand"] > 0] + + # ── Axis limits (~100-yr view) ── + h_max_yr = max((h for _, h, _ in zp), default=5.0) + y_max = max(h_max_yr * 1.50, 3.0) + + target_elev = dem_adj_m + y_max + rc_raw = rc_all[rc_all["location_id"] == loc_pad].sort_values("elev_m") + rc_below = rc_raw[rc_raw["elev_m"] <= target_elev] + rc_q_max = float(rc_below["flow_cms"].max()) if not rc_below.empty else 0.0 + x_max = max(rc_q_max, max((Q for _, _, Q in zp), default=50) * 1.4) * 1.08 + + go_h = src_orig[src_orig["HydroID"] == hydroid].sort_values("Stage") + gc_h = src_calib[src_calib["HydroID"] == hydroid].sort_values("Stage") + + # ── Zone shading ── + h_edges = [0.0] + [h for _, h, _ in zp] + if h_edges[-1] < y_max: + h_edges.append(y_max) + + for i, (h_lo, h_hi) in enumerate(zip(h_edges[:-1], h_edges[1:])): + col = ZONE_COLOURS[min(i, len(ZONE_COLOURS) - 1)] + ax.axhspan(h_lo, min(h_hi, y_max), color=col, alpha=0.28, zorder=0, lw=0) + + # ── Zone transition lines + return period labels (right-aligned, staggered) ── + min_lbl_sep = y_max * 0.07 + prev_lbl_h = -1.0 + for yr, h, _ in zp: + ax.axhline(h, color="#777", ls=":", lw=1.0, alpha=0.75, zorder=1) + lh = max(h, prev_lbl_h + min_lbl_sep) + lh = min(lh, y_max * 0.96) + ax.text(x_max * 0.988, lh, f"{yr} yr", + ha="right", va="bottom", fontsize=9.0, color="#555", + fontweight="bold", zorder=10) + prev_lbl_h = lh + + # ── USGS RC — solid dark reference line ── + if not grc.empty: + rc_plt = grc[grc["hand"] <= y_max * 1.02] + ax.plot(rc_plt["flow_cms"], rc_plt["hand"], + color="#222222", lw=2.8, ls="-", zorder=5, + label=f"USGS RC") + + # ── Recurrence markers — small circles on USGS RC, no text ── + for _, h, Q in zp: + ax.scatter(Q, h, color="#222222", s=50, zorder=8, marker="o", + edgecolors="white", linewidths=0.9) + + # ── SRC curves — original thin solid, calibrated thick dashed ── + if not go_h.empty: + ax.plot(go_h["Discharge (m3s-1)"], go_h["Stage"], + color="#2471a3", lw=1.8, ls="-", zorder=4, + label=f"Original SRC") + if not gc_h.empty: + ax.plot(gc_h["Discharge (m3s-1)"], gc_h["Stage"], + color="#c0392b", lw=3.0, ls="--", zorder=5, + label="Calibrated SRC") + # Legend proxy for recurrence markers + ax.scatter([], [], color="#222222", s=50, marker="o", + edgecolors="white", linewidths=0.9, + label="Recurrence interval") + + # ── Secondary top x-axis: Manning's n step function ── + ax_n = ax.twiny() + nx, hy = n_step_curve(zp, n_dict, y_max) + ax_n.plot(nx, hy, color="#444", alpha=0.38, lw=2.2, ls="-", zorder=3) + ax_n.fill_betweenx(hy, 0, nx, color="#888", alpha=0.07, zorder=2) + + all_n = [v for v in n_dict.values() if v is not None] + n_ax_max = max(max(all_n) * 1.45, 0.35) if all_n else 0.40 + ax_n.set_xlim(0, n_ax_max) + ax_n.set_xlabel("Manning's n →", fontsize=10, color="#555", labelpad=5) + ax_n.tick_params(axis="x", colors="#666", labelsize=9) + # Make sure the top spine is drawn (overriding the rcParam default) + for sp in ["right", "left", "bottom"]: + ax_n.spines[sp].set_visible(False) + ax_n.spines["top"].set_visible(True) + ax_n.spines["top"].set_color("#aaa") + ax_n.spines["top"].set_linewidth(0.9) + + # ── Main axis cosmetics ── + ax.set_xlim(0, x_max) + ax.set_ylim(0, y_max) + ax.set_xlabel("Discharge (m³/s)", labelpad=8) + ax.set_ylabel("Stage (m) [height above thalweg]", labelpad=8) + ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{x:,.0f}")) + + title = f"Branch {bid} · Gauge {loc_id} · HydroID {hydroid}" + if branch_avg: + title += "\n(branch 0 : branch-wide average n of both gauges applied)" + ax.set_title(title, fontweight="bold", pad=28) # pad leaves room for top axis + ax.legend(loc="upper left") + + +# ── Main ────────────────────────────────────────────────────────── + +def main(): + SAVE_TO.mkdir(parents=True, exist_ok=True) + elev, rc_all, recur = load_data() + + for bid in find_calibrated_branches(): + branch_gauges = elev[elev["levpa_id"].astype(str) == str(bid)].copy() + branch_gauges = branch_gauges[ + branch_gauges["location_id"].apply( + lambda loc: not rc_all[rc_all["location_id"] == str(loc).zfill(8)].empty + ) + ] + if branch_gauges.empty: + print(f"Branch {bid}: no RC data — skip") + continue + + src_orig = load_src(bid, original=True) + src_calib = load_src(bid, original=False) + n_g = len(branch_gauges) + + fig, axes = plt.subplots( + 1, n_g, + figsize=(11 * n_g, 8.5), + squeeze=False, + constrained_layout=True, + ) + + for col, (_, g) in enumerate(branch_gauges.iterrows()): + draw_gauge( + axes[0, col], bid, + str(g["location_id"]), + int(g["HydroID"]), + int(g["feature_id"]), + float(g["dem_adj_elevation"]), + src_orig, src_calib, rc_all, recur, + branch_avg=(bid == "0"), + ) + + n_reaches = src_orig["HydroID"].nunique() + fig.suptitle( + f"Manning's n Recurrence Calibration Branch {bid} HUC8 {HUC8}\n" + f"{n_reaches} reaches · {n_g} USGS gauge(s) · " + f"x-axis: ~100-yr discharge · top axis: n vs HAND stage", + fontsize=12, fontweight="bold", + ) + + out = SAVE_TO / f"src_calib_branch_{bid}.png" + fig.savefig(out, dpi=200, bbox_inches="tight", facecolor="white") + plt.close(fig) + print(f" Saved: {out}") + + print(f"\nDone -> {SAVE_TO}") + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_src_calibration_inset.py b/scripts/plot_src_calibration_inset.py new file mode 100644 index 0000000..a051d9a --- /dev/null +++ b/scripts/plot_src_calibration_inset.py @@ -0,0 +1,461 @@ +""" +Calibration comparison plots — variant with bottom-right inset for n vs HAND stage. + +Identical to plot_src_calibration.py except: + • No secondary top x-axis (twiny). + • Bottom-right inset axes showing Manning's n as a step function vs HAND stage, + with the same zone colouring as the main plot. + +Outputs saved as src_calib_inset_branch_{bid}.png (distinct from the top-axis variant). + +Run: + .venv\\Scripts\\python.exe scripts/plot_src_calibration_inset.py +""" +from __future__ import annotations +from pathlib import Path + +import matplotlib as mpl +import matplotlib.pyplot as plt +import matplotlib.ticker as mticker +import numpy as np +import pandas as pd + +# ── Publication style ───────────────────────────────────────────── +mpl.rcParams.update({ + "font.family": "DejaVu Sans", + "font.size": 11, + "axes.labelsize": 13, + "axes.titlesize": 12, + "xtick.labelsize": 11, + "ytick.labelsize": 11, + "legend.fontsize": 10.5, + "legend.framealpha": 0.93, + "legend.edgecolor": "#cccccc", + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.28, + "grid.linestyle": ":", +}) + +# ── CONFIG ──────────────────────────────────────────────────────── +HUC8 = "03020102" +OUT_DIR = Path("D:/SI/out") +DATA_DIR = Path("data") +# ───────────────────────────────────────────────────────────────── + +AOI_ROOT = OUT_DIR / f"HUC{HUC8}" +WATERSHED = AOI_ROOT / "watershed-data" +BRANCHES = WATERSHED / "branches" +SAVE_TO = AOI_ROOT / "src_plots" + +RECURRENCE_YEARS = [2, 5, 10, 25, 50] + +ZONE_COLOURS = ["#aed6f1", "#a9dfbf", "#fad7a0", "#f9e79f", "#f1948a", "#d2b4de"] +ZONE_ALPHA = 0.40 + + +# ── Loaders ─────────────────────────────────────────────────────── + +def load_data(): + elev = pd.read_csv( + AOI_ROOT / "usgs_elev_table.csv", + dtype={"location_id": str, "feature_id": "Int64"}, + ) + rc = pd.read_parquet(DATA_DIR / "usgs_rating_curves.parquet") + rc["flow_cms"] = rc["flow"].astype(float) / 35.3147 + rc["elev_m"] = rc["elevation_navd88"].astype(float) / 3.28084 + recur = pd.read_parquet(DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet") + for yr in RECURRENCE_YEARS: + recur[f"Q_{yr}yr_cms"] = ( + recur[f"{yr}_0_year_recurrence_flow_17C"].astype(float) * 0.028317 + ) + return elev, rc, recur + + +def load_src(bid: str, original: bool) -> pd.DataFrame: + branch_dir = BRANCHES / bid + fname = f"src_full_crosswalked_{bid}" + path = branch_dir / (fname + (".pre_n_calib.csv" if original else ".csv")) + if original and not path.exists(): + path = branch_dir / f"{fname}.csv" + return pd.read_csv(path, low_memory=False) + + +def find_calibrated_branches() -> list[str]: + return [ + bd.name for bd in sorted(BRANCHES.iterdir()) + if bd.is_dir() + and (bd / f"src_full_crosswalked_{bd.name}.pre_n_calib.csv").exists() + ] + + +# ── Zone helpers ────────────────────────────────────────────────── + +def zone_pts(loc_id, feature_id, dem_adj_m, rc_all, recur): + loc = loc_id.zfill(8) + grc = rc_all[rc_all["location_id"] == loc].sort_values("flow_cms") + if grc.empty: + return [] + feat = recur[recur["feature_id"] == feature_id] + if feat.empty: + return [] + pts = [] + for yr in RECURRENCE_YEARS: + Q = float(feat[f"Q_{yr}yr_cms"].iloc[0]) + if Q < grc["flow_cms"].min() or Q > grc["flow_cms"].max(): + continue + elev = float(np.interp(Q, grc["flow_cms"].values, grc["elev_m"].values)) + h = elev - dem_adj_m + if h > 0: + pts.append((yr, h, Q)) + return sorted(pts, key=lambda x: x[0]) + + +def back_calc_n(hydroid, src_df, zp): + hdf = src_df[src_df["HydroID"] == hydroid].sort_values("Stage") + if hdf.empty: + return {} + slope = float(hdf["SLOPE"].iloc[0]) + if slope <= 0: + return {} + out = {} + for yr, h, Q in zp: + idx = (hdf["Stage"] - h).abs().idxmin() + row = hdf.loc[idx] + A = float(row.get("WetArea (m2)", 0.0)) + R = float(row.get("HydraulicRadius (m)", 0.0)) + if A > 0 and R > 0 and Q > 0: + out[yr] = float(np.clip(A * (R ** (2 / 3)) * slope**0.5 / Q, 0.01, 0.30)) + return out + + +def _src_q_at_h(h, src_hid): + if src_hid.empty: + return 0.0 + return float(np.interp( + h, src_hid["Stage"].values, src_hid["Discharge (m3s-1)"].values, + left=0.0, right=float(src_hid["Discharge (m3s-1)"].max()), + )) + + +def n_step_curve(zp, n_dict, y_max): + """(n_vals, h_vals) for a horizontal step function: n on x, HAND stage on y.""" + yrs = [yr for yr, _, _ in zp] + n_arr = [n_dict.get(yr, 0.06) for yr in yrs] + n_arr.append(n_dict.get(yrs[-1], 0.06) if yrs else 0.06) + + h_brk = [0.0] + [h for _, h, _ in zp] + if h_brk[-1] < y_max: + h_brk.append(y_max) + + nx, hy = [], [] + for i, (h_lo, h_hi) in enumerate(zip(h_brk[:-1], h_brk[1:])): + n_v = n_arr[i] + nx.extend([n_v, n_v]) + hy.extend([h_lo, h_hi]) + if i < len(h_brk) - 2: + nx.append(n_arr[i + 1]) + hy.append(h_hi) + + return nx, hy + + +# ── Bottom-right inset: n vs HAND stage ─────────────────────────── + +def draw_n_inset(ax, zp, n_dict, y_max): + """ + Adds a small inset in the bottom-right corner of ax showing Manning's n + as a piecewise step function vs HAND stage. Zone colours match the main plot. + The bottom-right corner is typically empty in a stage-discharge plot because + discharge increases with stage, leaving high-Q / low-h space unused. + """ + inset = ax.inset_axes([0.68, 0.04, 0.27, 0.33]) # [x0, y0, w, h] in axes fraction + + yrs = [yr for yr, _, _ in zp] + h_brk = [0.0] + [h for _, h, _ in zp] + if h_brk[-1] < y_max: + h_brk.append(y_max) + n_arr = [n_dict.get(yr, 0.06) for yr in yrs] + n_arr.append(n_dict.get(yrs[-1], 0.06) if yrs else 0.06) + + all_n = [v for v in n_arr if v is not None] + if not all_n: + return + + n_lo = 0.0 + n_hi = max(all_n) * 1.45 + + # ── Zone-coloured horizontal bands ── + for i, (h_lo, h_hi) in enumerate(zip(h_brk[:-1], h_brk[1:])): + col = ZONE_COLOURS[min(i, len(ZONE_COLOURS) - 1)] + h_hi_c = min(h_hi, y_max) + inset.axhspan(h_lo, h_hi_c, color=col, alpha=0.55, lw=0, zorder=0) + + # ── Step function line ── + nx, hy = n_step_curve(zp, n_dict, y_max) + inset.plot(nx, hy, color="#1a1a1a", lw=2.4, zorder=4, solid_capstyle="butt") + + # ── n value label centred in each zone band ── + for i, (h_lo, h_hi) in enumerate(zip(h_brk[:-1], h_brk[1:])): + n_v = n_arr[i] + h_mid = (h_lo + min(h_hi, y_max)) / 2 + if h_mid <= y_max and n_v is not None: + # Place text just to the right of the step line, near the right edge + txt_x = n_v + (n_hi - n_v) * 0.20 + txt_x = min(txt_x, n_hi * 0.95) + inset.text( + txt_x, h_mid, f"{n_v:.3f}", + ha="left", va="center", fontsize=7.5, + color="#1a1a1a", fontweight="bold", + ) + + # ── Zone boundary dotted lines ── + for _, h, _ in zp: + inset.axhline(h, color="#888", ls=":", lw=0.8, alpha=0.7, zorder=2) + + # ── Axes cosmetics ── + inset.set_xlim(n_lo, n_hi) + inset.set_ylim(0, y_max) + inset.set_xlabel("Manning's n", fontsize=8.5, labelpad=3, color="#333") + inset.set_ylabel("h (m)", fontsize=8.5, labelpad=3, color="#333") + inset.tick_params(labelsize=7.5, colors="#555", length=3) + inset.xaxis.set_major_locator(mticker.MaxNLocator(3, prune="both")) + inset.yaxis.set_major_locator(mticker.MaxNLocator(4, prune="both")) + inset.set_title("n vs stage", fontsize=8.5, pad=4, color="#333", + fontweight="bold") + + # White semi-transparent background so it floats above main plot + inset.patch.set_facecolor("white") + inset.patch.set_alpha(0.88) + for sp in ["top", "right"]: + inset.spines[sp].set_visible(False) + for sp in ["bottom", "left"]: + inset.spines[sp].set_color("#999") + inset.spines[sp].set_linewidth(0.8) + inset.grid(True, alpha=0.18, color="#aaa", linestyle=":") + + # Thin border box around the whole inset + for sp in inset.spines.values(): + sp.set_linewidth(0.8) + + +# ── Main subplot drawing ────────────────────────────────────────── + +def draw_gauge(ax, bid, loc_id, hydroid, feature_id, dem_adj_m, + src_orig, src_calib, rc_all, recur, branch_avg): + + zp = zone_pts(loc_id, feature_id, dem_adj_m, rc_all, recur) + n_dict = back_calc_n(hydroid, src_orig, zp) + + loc_pad = loc_id.zfill(8) + grc = rc_all[rc_all["location_id"] == loc_pad].sort_values("flow_cms").copy() + grc["hand"] = grc["elev_m"] - dem_adj_m + grc = grc[grc["hand"] > 0] + + # ── Axis limits (~100-yr view) ── + h_max_yr = max((h for _, h, _ in zp), default=5.0) + y_max = max(h_max_yr * 1.50, 3.0) + + target_elev = dem_adj_m + y_max + rc_raw = rc_all[rc_all["location_id"] == loc_pad].sort_values("elev_m") + rc_below = rc_raw[rc_raw["elev_m"] <= target_elev] + rc_q_max = float(rc_below["flow_cms"].max()) if not rc_below.empty else 0.0 + x_max = max(rc_q_max, max((Q for _, _, Q in zp), default=50) * 1.4) * 1.08 + + go_h = src_orig[src_orig["HydroID"] == hydroid].sort_values("Stage") + gc_h = src_calib[src_calib["HydroID"] == hydroid].sort_values("Stage") + + # ── Zone shading + right-side labels ── + yrs_list = [yr for yr, _, _ in zp] + h_edges = [0.0] + [h for _, h, _ in zp] + if h_edges[-1] < y_max: + h_edges.append(y_max) + + prev = 0 + zone_labels = [] + for yr in yrs_list: + zone_labels.append(f"{prev}-{yr} yr") + prev = yr + zone_labels.append(f"> {prev} yr") + + n_per_zone = [n_dict.get(yr) for yr in yrs_list] + n_per_zone.append(n_dict.get(yrs_list[-1]) if yrs_list else None) + + # Inset occupies axes fraction [0.68, 0.04, 0.27, 0.33]: + # x: 0.68–0.95 of x_max y: 0.04–0.37 of y_max + # Lower zone labels are placed LEFT of the inset; upper zone labels go right + # and are staggered into two columns when they stack too close vertically. + INSET_Y_TOP = 0.37 # inset top edge as fraction of y_max + INSET_X_LEFT = 0.68 # inset left edge as fraction of x_max + + prev_upper_h = -1.0 # last placed upper-zone label h_mid + stagger_right = True # alternates column when upper labels are compressed + + for i, (h_lo, h_hi) in enumerate(zip(h_edges[:-1], h_edges[1:])): + h_hi_c = min(h_hi, y_max) + col = ZONE_COLOURS[min(i, len(ZONE_COLOURS) - 1)] + ax.axhspan(h_lo, h_hi_c, color=col, alpha=ZONE_ALPHA, zorder=0, lw=0) + + h_mid = (h_lo + h_hi_c) / 2 + q_max_src = max(_src_q_at_h(h_mid, go_h), _src_q_at_h(h_mid, gc_h)) + + if h_mid < INSET_Y_TOP * y_max: + # Lower zone: keep label LEFT of inset so they never overlap + lx = float(np.clip( + max(q_max_src * 2.0, x_max * 0.26), + q_max_src * 1.4, + INSET_X_LEFT * x_max * 0.86, # cap well below inset left edge + )) + prev_upper_h = -1.0 # reset upper stagger when we re-enter lower zones + stagger_right = True + else: + # Upper zone: right side, stagger into two columns when compressed + if prev_upper_h > 0 and abs(h_mid - prev_upper_h) < 0.12 * y_max: + if stagger_right: + lx = float(np.clip(max(q_max_src * 1.25, x_max * 0.74), + 0, x_max * 0.95)) + stagger_right = False + else: + lx = float(np.clip(max(q_max_src * 1.25, x_max * 0.55), + 0, x_max * 0.73)) + stagger_right = True + else: + lx = float(np.clip(max(q_max_src * 1.25, x_max * 0.65), + 0, x_max * 0.95)) + stagger_right = True # next close label starts in right column + prev_upper_h = h_mid + + n_v = n_per_zone[i] + lbl = zone_labels[i] + (f"\nn = {n_v:.3f}" if n_v is not None else "") + ax.text(lx, h_mid, lbl, + ha="center", va="center", fontsize=9.5, color="#222", + bbox=dict(boxstyle="round,pad=0.35", fc="white", + ec=col, linewidth=1.4, alpha=0.95), + zorder=9) + + for _, h, _ in zp: + ax.axhline(h, color="#888", ls=":", lw=0.9, alpha=0.60, zorder=1) + + # ── USGS RC ── + if not grc.empty: + rc_plt = grc[grc["hand"] <= y_max * 1.02] + ax.plot(rc_plt["flow_cms"], rc_plt["hand"], + color="#1a7a4a", lw=2.5, ls="--", zorder=5, + label=f"USGS RC (gauge {loc_id})") + + # ── Recurrence markers — cascade stagger to prevent label overlap ── + MIN_SEP_FRAC = 0.09 + label_ys = [] + + for yr, h, Q in zp: + ly = h + while any(abs(ly - py) < MIN_SEP_FRAC * y_max for py in label_ys): + ly += MIN_SEP_FRAC * y_max * 0.55 + ly = min(ly, y_max * 0.97) + label_ys.append(ly) + + ax.scatter(Q, h, color="#1a7a4a", s=110, zorder=7, marker="D") + + lx = Q + x_max * 0.04 + needs_arrow = abs(ly - h) > y_max * 0.04 + + if needs_arrow: + ax.annotate( + f"{yr} yr", + xy=(Q, h), + xytext=(lx, ly), + arrowprops=dict(arrowstyle="->", color="#1a7a4a", + lw=0.85, connectionstyle="arc3,rad=0.0"), + fontsize=9.5, color="#1a7a4a", fontweight="bold", + ha="left", va="center", zorder=8, + ) + else: + ax.text(lx, ly, f"{yr} yr", + fontsize=9.5, color="#1a7a4a", fontweight="bold", + ha="left", va="center", zorder=8) + + # ── SRC curves ── + if not go_h.empty: + ax.plot(go_h["Discharge (m3s-1)"], go_h["Stage"], + color="#2471a3", lw=3.0, zorder=4, + label=f"Original SRC (HydroID {hydroid})") + if not gc_h.empty: + ax.plot(gc_h["Discharge (m3s-1)"], gc_h["Stage"], + color="#c0392b", lw=3.0, zorder=4, + label="Calibrated SRC") + + # ── Bottom-right inset: n vs HAND stage ── + draw_n_inset(ax, zp, n_dict, y_max) + + # ── Main axis cosmetics ── + ax.set_xlim(0, x_max) + ax.set_ylim(0, y_max) + ax.set_xlabel("Discharge (m3/s)", labelpad=8) + ax.set_ylabel("HAND Stage (m) [height above thalweg]", labelpad=8) + ax.xaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f"{x:,.0f}")) + + title = f"Branch {bid} - Gauge {loc_id} - HydroID {hydroid}" + if branch_avg: + title += "\n(branch 0 : branch-wide average n of both gauges applied)" + ax.set_title(title, fontweight="bold", pad=10) + ax.legend(loc="upper left") + + +# ── Main ────────────────────────────────────────────────────────── + +def main(): + SAVE_TO.mkdir(parents=True, exist_ok=True) + elev, rc_all, recur = load_data() + + for bid in find_calibrated_branches(): + branch_gauges = elev[elev["levpa_id"].astype(str) == str(bid)].copy() + branch_gauges = branch_gauges[ + branch_gauges["location_id"].apply( + lambda loc: not rc_all[rc_all["location_id"] == str(loc).zfill(8)].empty + ) + ] + if branch_gauges.empty: + print(f"Branch {bid}: no RC data - skip") + continue + + src_orig = load_src(bid, original=True) + src_calib = load_src(bid, original=False) + n_g = len(branch_gauges) + + fig, axes = plt.subplots( + 1, n_g, + figsize=(11 * n_g, 8.5), + squeeze=False, + constrained_layout=True, + ) + + for col, (_, g) in enumerate(branch_gauges.iterrows()): + draw_gauge( + axes[0, col], bid, + str(g["location_id"]), + int(g["HydroID"]), + int(g["feature_id"]), + float(g["dem_adj_elevation"]), + src_orig, src_calib, rc_all, recur, + branch_avg=(bid == "0"), + ) + + n_reaches = src_orig["HydroID"].nunique() + fig.suptitle( + f"Manning's n Recurrence Calibration Branch {bid} HUC8 {HUC8}\n" + f"{n_reaches} reaches - {n_g} USGS gauge(s) - " + f"x-axis: ~100-yr discharge - inset: n vs HAND stage", + fontsize=12, fontweight="bold", + ) + + out = SAVE_TO / f"src_calib_inset_branch_{bid}.png" + fig.savefig(out, dpi=200, bbox_inches="tight", facecolor="white") + plt.close(fig) + print(f" Saved: {out}") + + print(f"\nDone -> {SAVE_TO}") + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_src_ensemble.py b/scripts/plot_src_ensemble.py new file mode 100644 index 0000000..29e924e --- /dev/null +++ b/scripts/plot_src_ensemble.py @@ -0,0 +1,467 @@ +""" +SRC ensemble plots — clean, physically informative version. + +Produces three figures in E:/SI/out/calibration_analysis/: + + src_ensemble_sidebyside.png + Two panels side-by-side: original SRCs (left) and calibrated SRCs (right). + Individual gray spaghetti lines + bold colored median. Same axis limits + on both panels. No percentile bands. + + src_ensemble_absolute.png + Both ensembles overlaid on the same axes (log x-scale). + Individual spaghetti lines (light) + bold study-area median per ensemble. + + src_ensemble_normalized.png + Both ensembles normalized by each gauge's NWM 2-year recurrence flow + (Q / Q_2yr), removing the river-size effect. + +Run: + .venv\\Scripts\\python.exe scripts/plot_src_ensemble.py +""" +from __future__ import annotations + +import logging +from pathlib import Path + +import matplotlib as mpl +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from matplotlib.lines import Line2D + +# ── Style ───────────────────────────────────────────────────────── +mpl.rcParams.update({ + "font.family": "DejaVu Sans", + "font.size": 11, + "axes.labelsize": 13, + "axes.titlesize": 12, + "axes.titleweight": "bold", + "axes.spines.top": False, + "axes.spines.right": False, + "axes.grid": True, + "grid.alpha": 0.20, + "grid.linestyle": ":", + "legend.fontsize": 10, + "legend.framealpha": 0.92, + "figure.dpi": 150, +}) + +C_ORIG = "#2471a3" # blue — original SRC +C_CALIB = "#c0392b" # red — calibrated SRC +C_USGS = "#111111" # black — USGS observed RC + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") +DATA_DIR = Path("data") +SUMMARY_DIR = Path("E:/SI/out/calibration_analysis") +RECURRENCE_YEARS = [2, 5, 10, 25, 50] +RECUR_COLS = {yr: f"{yr}_0_year_recurrence_flow_17C" for yr in RECURRENCE_YEARS} +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 1 — Data helpers +# ══════════════════════════════════════════════════════════════════ + +def _load_shared(): + rc = pd.read_parquet(DATA_DIR / "usgs_rating_curves.parquet") + rc["location_id"] = rc["location_id"].astype(str) + rc["flow_cms"] = rc["flow"].astype(float) / 35.3147 + rc["elev_m"] = rc["elevation_navd88"].astype(float) / 3.28084 + + recur = pd.read_parquet(DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet") + for yr, col in RECUR_COLS.items(): + recur[f"Q_{yr}yr_cms"] = recur[col].astype(float) * 0.028317 + recur["feature_id"] = recur["feature_id"].astype("Int64") + return rc, recur + + +def _load_src(branch_dir: Path, bid: str, original: bool) -> pd.DataFrame | None: + suffix = ".pre_n_calib.csv" if original else ".csv" + p = branch_dir / f"src_full_crosswalked_{bid}{suffix}" + if not p.exists() and original: + p = branch_dir / f"src_full_crosswalked_{bid}.csv" + if not p.exists(): + return None + df = pd.read_csv(p, low_memory=False) + df["HydroID"] = df["HydroID"].astype(int) + return df + + +def _discover_calibrated(huc8: str, rc: pd.DataFrame, elev: pd.DataFrame) -> list[dict]: + branches_dir = OUT_DIR / f"HUC{huc8}" / "watershed-data" / "branches" + if not branches_dir.exists(): + return [] + rc_ids = set(rc["location_id"].astype(str)) + results = [] + for branch_dir in sorted(branches_dir.iterdir()): + if not branch_dir.is_dir(): + continue + bid = branch_dir.name + if bid == "0": + continue + if not (branch_dir / f"src_full_crosswalked_{bid}.pre_n_calib.csv").exists(): + continue + for _, gage in elev[elev["levpa_id"].astype(str) == bid].iterrows(): + loc_id = str(gage["location_id"]).zfill(8) + if loc_id not in rc_ids or pd.isna(gage.get("feature_id")): + continue + results.append({ + "bid": bid, + "location_id": loc_id, + "hydroid": int(gage["HydroID"]), + "feature_id": int(gage["feature_id"]), + "dem_adj_m": float(gage["dem_adj_elevation"]), + "branch_dir": branch_dir, + }) + return results + + +def collect_curves(hucs: list[str], rc: pd.DataFrame, recur: pd.DataFrame): + """ + Returns (orig, calib, usgs_rc) — three parallel lists of dicts: + + q : ndarray discharge (m³/s) + h : ndarray HAND stage (m) + Q_2yr : float NWM 2-yr recurrence flow (m³/s) — normalization factor + """ + orig, calib, usgs_rc = [], [], [] + + for huc8 in hucs: + elev_path = OUT_DIR / f"HUC{huc8}" / "usgs_elev_table.csv" + if not elev_path.exists(): + continue + elev = pd.read_csv(elev_path, dtype={"location_id": str, "feature_id": "Int64"}) + gauges = _discover_calibrated(huc8, rc, elev) + if not gauges: + continue + + by_branch: dict[str, list] = {} + for g in gauges: + by_branch.setdefault(g["bid"], []).append(g) + + for bid, gage_list in by_branch.items(): + bdir = gage_list[0]["branch_dir"] + src_o = _load_src(bdir, bid, original=True) + src_c = _load_src(bdir, bid, original=False) + if src_o is None or src_c is None: + continue + + for g in gage_list: + feat = recur[recur["feature_id"] == g["feature_id"]] + if feat.empty: + continue + Q_2yr = float(feat["Q_2yr_cms"].iloc[0]) + if Q_2yr <= 0: + continue + + hid = g["hydroid"] + s_o = src_o[src_o["HydroID"] == hid].sort_values("Stage") + s_c = src_c[src_c["HydroID"] == hid].sort_values("Stage") + if s_o.empty or s_c.empty: + continue + + base = {"Q_2yr": Q_2yr} + orig.append({**base, + "q": s_o["Discharge (m3s-1)"].values.copy(), + "h": s_o["Stage"].values.copy()}) + calib.append({**base, + "q": s_c["Discharge (m3s-1)"].values.copy(), + "h": s_c["Stage"].values.copy()}) + + # USGS RC converted to HAND space + grc = rc[rc["location_id"] == g["location_id"]].copy() + grc["hand"] = grc["elev_m"] - g["dem_adj_m"] + grc = grc[grc["hand"] > 0].sort_values("hand") + if not grc.empty: + usgs_rc.append({**base, + "q": grc["flow_cms"].values.copy(), + "h": grc["hand"].values.copy()}) + + log.info(" %d SRC pairs | %d USGS RC sets", len(orig), len(usgs_rc)) + return orig, calib, usgs_rc + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 2 — Ensemble median +# ══════════════════════════════════════════════════════════════════ + +def _median_on_grid(curves: list[dict], h_grid: np.ndarray, + normalized: bool) -> np.ndarray: + """ + Interpolate each curve onto h_grid, return nanmedian across all curves. + If normalized=True, divide each curve's discharge by its Q_2yr first. + """ + q_mat = np.full((len(curves), len(h_grid)), np.nan) + for i, c in enumerate(curves): + q = c["q"] / c["Q_2yr"] if normalized else c["q"] + valid = c["h"] > 0 + if valid.sum() < 2: + continue + q_mat[i] = np.interp(h_grid, c["h"][valid], q[valid], + left=np.nan, right=np.nan) + return np.nanmedian(q_mat, axis=0) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 3 — Plot 1: Side-by-side panels (original | calibrated) +# ══════════════════════════════════════════════════════════════════ + +def plot_sidebyside(orig: list, calib: list, out_path: Path) -> None: + """ + Two panels with the same axes: original SRCs on the left, calibrated on + the right. Individual curves are drawn in gray (neutral — each represents + a different-sized river) with a bold colored median. No percentile bands. + X-axis is clipped at the 98th percentile of discharge so the majority of + curves are visible without outliers dominating the scale. + """ + # Shared axis limits from all curves combined + all_q = np.concatenate([c["q"] for c in orig + calib]) + all_h = np.concatenate([c["h"] for c in orig + calib]) + x_max = float(np.nanpercentile(all_q[all_q > 0], 98)) * 1.05 + y_max = float(np.nanpercentile(all_h, 99)) * 1.05 + + h_grid = np.linspace(0.05, y_max, 600) + med_o = _median_on_grid(orig, h_grid, normalized=False) + med_c = _median_on_grid(calib, h_grid, normalized=False) + + fig, axes = plt.subplots(1, 2, figsize=(16, 8), + sharey=True, constrained_layout=True) + + panels = [ + (axes[0], orig, med_o, C_ORIG, "Original SRCs (pre-calibration)"), + (axes[1], calib, med_c, C_CALIB, "Calibrated SRCs (post-calibration)"), + ] + + for ax, curves, med, color, title in panels: + # Individual gray spaghetti + for c in curves: + mask = (c["q"] > 0) & (c["q"] <= x_max * 1.05) + ax.plot(c["q"][mask], c["h"][mask], + color="#777", lw=0.6, alpha=0.28, zorder=2) + # Median + valid = (med > 0) & (med <= x_max) + ax.plot(med[valid], h_grid[valid], + color=color, lw=3.0, zorder=6, label="Median") + + ax.set_xlim(0, x_max) + ax.set_ylim(0, y_max) + ax.set_xlabel("Discharge (m³/s)", labelpad=8) + ax.set_title(f"{title}\n(n = {len(curves)} reaches)", pad=8) + ax.legend(loc="upper left", fontsize=10) + ax.xaxis.set_major_formatter( + plt.matplotlib.ticker.FuncFormatter(lambda x, _: f"{x:,.0f}") + ) + + axes[0].set_ylabel("HAND Stage (m above thalweg)", labelpad=8) + + fig.suptitle("Synthetic Rating Curve Ensemble — Original vs Calibrated", + fontsize=13, fontweight="bold") + + fig.savefig(out_path, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info("Saved: %s", out_path.name) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 4 — Plot 2: Absolute, log x-scale +# ══════════════════════════════════════════════════════════════════ + +def plot_absolute(orig: list, calib: list, out_path: Path) -> None: + """ + Original and calibrated SRCs on the same axes. + + Log x-scale lets small creeks and large rivers coexist without the large + rivers dominating the entire visible area. No percentile bands — they + are not interpretable when river sizes span 3-4 orders of magnitude. + The median lines summarise the typical shift from calibration. + """ + fig, ax = plt.subplots(figsize=(13, 8), constrained_layout=True) + + # ── Individual spaghetti ────────────────────────────────────── + for c in orig: + mask = c["q"] > 0 + ax.plot(c["q"][mask], c["h"][mask], + color=C_ORIG, lw=0.55, alpha=0.18, zorder=2) + for c in calib: + mask = c["q"] > 0 + ax.plot(c["q"][mask], c["h"][mask], + color=C_CALIB, lw=0.55, alpha=0.18, zorder=2) + + # ── Medians ─────────────────────────────────────────────────── + h_max = float(max(c["h"].max() for c in orig + calib)) + h_grid = np.linspace(0.05, h_max, 600) + med_o = _median_on_grid(orig, h_grid, normalized=False) + med_c = _median_on_grid(calib, h_grid, normalized=False) + + ax.plot(med_o[med_o > 0], h_grid[med_o > 0], + color=C_ORIG, lw=3.0, zorder=6, label="Original — median") + ax.plot(med_c[med_c > 0], h_grid[med_c > 0], + color=C_CALIB, lw=3.0, ls="--", zorder=6, label="Calibrated — median") + + # ── Axes ────────────────────────────────────────────────────── + q_pos = np.concatenate([c["q"][c["q"] > 0] for c in orig + calib]) + ax.set_xscale("log") + ax.set_xlim(max(float(np.nanpercentile(q_pos, 0.5)), 0.1), None) + ax.set_ylim(0, h_max) + ax.set_xlabel("Discharge (m³/s) — log scale", labelpad=8) + ax.set_ylabel("HAND Stage (m above thalweg)", labelpad=8) + ax.set_title( + "SRC Ensemble — Original vs Calibrated\n" + f"n = {len(orig)} gauged HydroIDs | log x-scale so all river sizes are visible", + pad=10, + ) + + handles = [ + Line2D([0], [0], color=C_ORIG, lw=3.0, label=f"Original — median (n = {len(orig)})"), + Line2D([0], [0], color=C_CALIB, lw=3.0, ls="--", label=f"Calibrated — median (n = {len(calib)})"), + Line2D([0], [0], color="#888", lw=0.8, alpha=0.5, label="Individual reaches"), + ] + ax.legend(handles=handles, loc="upper left") + ax.text(0.98, 0.02, + "Median shift LEFT → calibration raises stage for same discharge | " + "RIGHT → calibration lowers stage", + transform=ax.transAxes, ha="right", va="bottom", + fontsize=8.5, color="#666", style="italic") + + fig.savefig(out_path, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info("Saved: %s", out_path.name) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 5 — Plot 3: Normalized by Q_2yr +# ══════════════════════════════════════════════════════════════════ + +def plot_normalized(orig: list, calib: list, usgs_rc: list, + out_path: Path) -> None: + """ + Discharge normalized by each gauge's NWM 2-yr recurrence flow. + + Q / Q_2yr = 1.0 → 2-yr flood (common flood) + Q / Q_2yr = 2 → roughly 5-yr flood + Q / Q_2yr = 5 → roughly 25-yr flood + + Removing river-size effect means the SHAPE of the SRC and the calibration + SHIFT are directly comparable across all reaches. + + Key property: by construction, calibrated SRCs cross Q/Q_2yr = 1.0 at + exactly the HAND stage where the USGS RC hits the 2-yr NWM flow. + """ + fig, ax = plt.subplots(figsize=(13, 8), constrained_layout=True) + + # Reasonable x-axis limit: 98th %ile of normalized Q across all curves + all_qn = np.concatenate([c["q"] / c["Q_2yr"] + for c in orig + calib if c["Q_2yr"] > 0]) + x_max = max(float(np.nanpercentile(all_qn[all_qn > 0], 98)) * 1.1, 6.0) + + def _clip(q_norm, h): + mask = (q_norm > 0) & (q_norm <= x_max * 1.05) & (h > 0) + return q_norm[mask], h[mask] + + # ── USGS RC (plotted first, behind SRC curves) ──────────────── + for c in usgs_rc: + qn = c["q"] / c["Q_2yr"] + qn_c, h_c = _clip(qn, c["h"]) + if len(h_c) < 2: + continue + ax.plot(qn_c, h_c, color=C_USGS, lw=0.8, alpha=0.15, zorder=1) + + # ── Individual SRC curves ───────────────────────────────────── + for c in orig: + qn = c["q"] / c["Q_2yr"] + qn_c, h_c = _clip(qn, c["h"]) + ax.plot(qn_c, h_c, color=C_ORIG, lw=0.55, alpha=0.18, zorder=2) + for c in calib: + qn = c["q"] / c["Q_2yr"] + qn_c, h_c = _clip(qn, c["h"]) + ax.plot(qn_c, h_c, color=C_CALIB, lw=0.55, alpha=0.18, zorder=2) + + # ── Medians ─────────────────────────────────────────────────── + h_max = min(float(max(c["h"].max() for c in orig + calib)), 20.0) + h_grid = np.linspace(0.05, h_max, 600) + med_o = _median_on_grid(orig, h_grid, normalized=True) + med_c = _median_on_grid(calib, h_grid, normalized=True) + + valid_o = (med_o > 0) & (med_o <= x_max) + valid_c = (med_c > 0) & (med_c <= x_max) + ax.plot(med_o[valid_o], h_grid[valid_o], + color=C_ORIG, lw=3.0, zorder=6, label="Original — median") + ax.plot(med_c[valid_c], h_grid[valid_c], + color=C_CALIB, lw=3.0, ls="--", zorder=6, label="Calibrated — median") + + # ── Reference line at Q/Q_2yr = 1 ──────────────────────────── + ax.axvline(1.0, color="#555", ls=":", lw=1.8, alpha=0.8, zorder=5) + ax.text(1.03, h_max * 0.97, "Q = Q₂ yr", + ha="left", va="top", fontsize=9.5, color="#555", style="italic") + + # ── Axes ────────────────────────────────────────────────────── + ax.set_xlim(0, x_max) + ax.set_ylim(0, h_max) + ax.set_xlabel("Normalized Discharge Q / Q₂ᵧᵣ (dimensionless)", labelpad=8) + ax.set_ylabel("HAND Stage (m above thalweg)", labelpad=8) + ax.set_title( + "SRC Ensemble — Normalized by 2-Year Recurrence Flow\n" + "River-size effect removed · shape and calibration shift are directly comparable", + pad=10, + ) + + handles = [ + Line2D([0], [0], color=C_ORIG, lw=3.0, label=f"Original SRC — median (n = {len(orig)})"), + Line2D([0], [0], color=C_CALIB, lw=3.0, ls="--", label=f"Calibrated SRC — median (n = {len(calib)})"), + Line2D([0], [0], color=C_USGS, lw=1.0, alpha=0.5, label=f"USGS observed RC (n = {len(usgs_rc)})"), + Line2D([0], [0], color="#888", lw=0.8, alpha=0.5, label="Individual reaches"), + ] + ax.legend(handles=handles, loc="upper left") + ax.text(0.98, 0.02, + "Calibrated SRCs cross Q/Q₂ᵧᵣ = 1.0 at the 2-yr HAND stage by construction", + transform=ax.transAxes, ha="right", va="bottom", + fontsize=8.5, color="#666", style="italic") + + fig.savefig(out_path, dpi=150, bbox_inches="tight", facecolor="white") + plt.close(fig) + log.info("Saved: %s", out_path.name) + + +# ══════════════════════════════════════════════════════════════════ +# SECTION 6 — Main +# ══════════════════════════════════════════════════════════════════ + +def main(): + SUMMARY_DIR.mkdir(parents=True, exist_ok=True) + + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + log.info("Study area: %d HUCs", len(hucs)) + + log.info("Loading shared tables …") + rc, recur = _load_shared() + + log.info("Collecting SRC curves …") + orig, calib, usgs_rc = collect_curves(hucs, rc, recur) + + log.info("Plot 1 — side-by-side panels (original | calibrated) …") + plot_sidebyside(orig, calib, + SUMMARY_DIR / "src_ensemble_sidebyside.png") + + log.info("Plot 2 — absolute ensemble overlaid (log scale) …") + plot_absolute(orig, calib, + SUMMARY_DIR / "src_ensemble_absolute.png") + + log.info("Plot 3 — normalized ensemble (Q / Q_2yr) …") + plot_normalized(orig, calib, usgs_rc, + SUMMARY_DIR / "src_ensemble_normalized.png") + + log.info("Done. Outputs → %s", SUMMARY_DIR) + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_srcs.py b/scripts/plot_srcs.py new file mode 100644 index 0000000..7fd2af6 --- /dev/null +++ b/scripts/plot_srcs.py @@ -0,0 +1,168 @@ +""" +Plot Synthetic Rating Curves (SRCs) for a single HUC8 after Stage 2. + +Reads all hydroTable_*.csv files from every branch, then produces two figures: + 1. Overview — all SRCs on one axes, colored by Strahler order + 2. Branch grid — one subplot per branch showing individual reach SRCs + +Run: + .venv\\Scripts\\python.exe scripts/plot_srcs.py +""" +from pathlib import Path +import pandas as pd +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +import numpy as np + +# ── CONFIG ──────────────────────────────────────────────────────── +HUC8 = "03020102" +OUT_DIR = Path("D:/SI/out") +SAVE_TO = Path("D:/SI/out") / f"HUC{HUC8}" / "src_plots" +# ───────────────────────────────────────────────────────────────── + +WATERSHED = OUT_DIR / f"HUC{HUC8}" / "watershed-data" +BRANCHES = WATERSHED / "branches" + + +def load_all_hydrotables() -> pd.DataFrame: + frames = [] + for branch_dir in sorted(BRANCHES.iterdir()): + if not branch_dir.is_dir(): + continue + branch_id = branch_dir.name + htable = branch_dir / f"hydroTable_{branch_id}.csv" + if not htable.exists(): + continue + df = pd.read_csv(htable, dtype={"feature_id": str}) + df["branch_id"] = branch_id + frames.append(df) + if not frames: + raise FileNotFoundError(f"No hydroTable CSVs found under {BRANCHES}") + return pd.concat(frames, ignore_index=True) + + +def _order_cmap(orders): + max_order = max(orders) if orders else 5 + cmap = plt.colormaps["plasma_r"].resampled(max_order) + return {o: mcolors.to_hex(cmap(o / max_order)) for o in range(1, max_order + 1)} + + +def plot_overview(df: pd.DataFrame, save_to: Path) -> None: + """All SRCs on one axes, colored by Strahler stream order.""" + fig, ax = plt.subplots(figsize=(10, 7)) + + orders = sorted(df["order_"].dropna().unique().astype(int)) + colors = _order_cmap(orders) + + for order in orders: + subset = df[df["order_"] == order] + for _, grp in subset.groupby("HydroID"): + grp_s = grp.sort_values("stage") + ax.plot( + grp_s["discharge_cms"], + grp_s["stage"], + color=colors[order], + alpha=0.25, + linewidth=0.7, + ) + + # Legend — one proxy per order + handles = [ + plt.Line2D([0], [0], color=colors[o], linewidth=1.5, label=f"Order {o}") + for o in orders + ] + ax.legend(handles=handles, title="Strahler Order", loc="upper left", fontsize=8) + + ax.set_xlabel("Discharge (m³/s)", fontsize=11) + ax.set_ylabel("Stage (m)", fontsize=11) + ax.set_title(f"Synthetic Rating Curves — HUC8 {HUC8}\n" + f"({df['HydroID'].nunique()} reaches across " + f"{df['branch_id'].nunique()} branches)", + fontsize=12) + ax.set_xlim(left=0) + ax.set_ylim(bottom=0) + ax.grid(True, alpha=0.3) + fig.tight_layout() + + out = save_to / "src_overview.png" + fig.savefig(out, dpi=150) + plt.close(fig) + print(f"Saved: {out}") + + +def plot_branch_grid(df: pd.DataFrame, save_to: Path) -> None: + """One subplot per branch, each reach as a separate line.""" + branches = sorted(df["branch_id"].unique()) + n = len(branches) + ncols = 4 + nrows = int(np.ceil(n / ncols)) + + fig, axes = plt.subplots(nrows, ncols, figsize=(ncols * 4, nrows * 3.5), + sharex=False, sharey=False) + axes_flat = np.array(axes).flatten() + + orders_global = sorted(df["order_"].dropna().unique().astype(int)) + colors = _order_cmap(orders_global) + + for i, branch_id in enumerate(branches): + ax = axes_flat[i] + bdf = df[df["branch_id"] == branch_id] + + for order in orders_global: + subset = bdf[bdf["order_"] == order] + for _, grp in subset.groupby("HydroID"): + grp_s = grp.sort_values("stage") + ax.plot( + grp_s["discharge_cms"], + grp_s["stage"], + color=colors[order], + alpha=0.5, + linewidth=0.9, + ) + + ax.set_title(f"Branch {branch_id}\n({bdf['HydroID'].nunique()} reaches)", + fontsize=7) + ax.set_xlabel("Q (m³/s)", fontsize=7) + ax.set_ylabel("Stage (m)", fontsize=7) + ax.tick_params(labelsize=6) + ax.set_xlim(left=0) + ax.set_ylim(bottom=0) + ax.grid(True, alpha=0.3) + + # Hide unused subplots + for j in range(i + 1, len(axes_flat)): + axes_flat[j].set_visible(False) + + # Shared legend at the bottom + handles = [ + plt.Line2D([0], [0], color=colors[o], linewidth=1.5, label=f"Order {o}") + for o in orders_global + ] + fig.legend(handles=handles, title="Strahler Order", + loc="lower right", ncol=len(orders_global), fontsize=8) + + fig.suptitle(f"SRCs by Branch — HUC8 {HUC8}", fontsize=13, y=1.01) + fig.tight_layout() + + out = save_to / "src_by_branch.png" + fig.savefig(out, dpi=150, bbox_inches="tight") + plt.close(fig) + print(f"Saved: {out}") + + +def main(): + SAVE_TO.mkdir(parents=True, exist_ok=True) + + print(f"Loading hydroTables from {BRANCHES} ...") + df = load_all_hydrotables() + print(f" {len(df):,} rows | {df['HydroID'].nunique()} reaches | " + f"{df['branch_id'].nunique()} branches") + + plot_overview(df, SAVE_TO) + plot_branch_grid(df, SAVE_TO) + + print("\nDone. Plots saved to:", SAVE_TO) + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_workflow_chart.py b/scripts/plot_workflow_chart.py new file mode 100644 index 0000000..2d720cf --- /dev/null +++ b/scripts/plot_workflow_chart.py @@ -0,0 +1,240 @@ +""" +PPT-ready workflow chart for the FIMbox calibration pipeline. + +Run: + .venv\\Scripts\\python.exe scripts/plot_workflow_chart.py +""" +from pathlib import Path +from matplotlib.patches import FancyBboxPatch +import matplotlib.pyplot as plt +import matplotlib as mpl + +mpl.rcParams["font.family"] = "DejaVu Sans" + +SAVE_TO = Path("D:/SI/out/HUC03020102/src_plots") + +# ── Color palette ───────────────────────────────────────────────── +C_PIPE = "#1b4f72" # dark navy — OWP pipeline stages +C_CUSTOM = "#a04000" # burnt orange — custom scripts (our work) +C_DATA = "#1a6b3c" # dark green — external data sources +C_OUT = "#0e6655" # teal — calibrated outputs +C_FUTURE = "#717d7e" # gray — future steps +C_ML = "#6c3483" # purple — ML future step +BG = "#f0f3f4" # background + + +# ── Helpers ─────────────────────────────────────────────────────── + +def draw_box(ax, cx, cy, w, h, lines, color, tc="white", + fs=9.5, future=False): + lx, by = cx - w / 2, cy - h / 2 + fc = "#c8cacb" if future else color + ls = "--" if future else "-" + lw = 1.2 if future else 2.0 + ec = "#aaa" if future else "white" + tc_use = "#666" if future else tc + + ax.add_patch(FancyBboxPatch( + (lx, by), w, h, + boxstyle="round,pad=0.13", + facecolor=fc, edgecolor=ec, + linewidth=lw, linestyle=ls, zorder=3, + )) + ax.text(cx, cy, "\n".join(lines), + ha="center", va="center", fontsize=fs, + color=tc_use, fontweight="bold", + multialignment="center", linespacing=1.45, + zorder=4) + + +def arrow(ax, x1, y1, x2, y2, color="#444", lw=2.0, rad=0.0): + ax.annotate("", xy=(x2, y2), xytext=(x1, y1), + arrowprops=dict( + arrowstyle="-|>", color=color, + lw=lw, mutation_scale=14, + connectionstyle=f"arc3,rad={rad}", + ), + zorder=5, annotation_clip=False) + + +def legend_entry(ax, cx, cy, w, h, color, label, future=False): + fc = "#c8cacb" if future else color + ax.add_patch(FancyBboxPatch( + (cx - w/2, cy - h/2), w, h, + boxstyle="round,pad=0.06", + facecolor=fc, edgecolor="white", + linewidth=1.0, linestyle="--" if future else "-", + zorder=3, + )) + ax.text(cx + w/2 + 0.15, cy, label, + ha="left", va="center", fontsize=8.5, + color="#222", zorder=4) + + +# ── Figure ──────────────────────────────────────────────────────── + +fig, ax = plt.subplots(figsize=(17, 11)) +ax.set_xlim(0, 17) +ax.set_ylim(0, 11) +ax.axis("off") +fig.patch.set_facecolor(BG) +ax.set_facecolor(BG) + +MX = 8.7 # main pipeline x-center +BW = 5.6 # main box width + +# ── Title ───────────────────────────────────────────────────────── + +ax.text(MX, 10.7, + "FIMbox Manning's n Calibration Pipeline", + ha="center", va="center", fontsize=15, + fontweight="bold", color="#111") +ax.text(MX, 10.35, + "NOAA OWP HAND-FIM Testbed | NC/SC Coastal Study Area | 24 HUC8s", + ha="center", va="center", fontsize=10, color="#555") + +# ── Main pipeline boxes ─────────────────────────────────────────── + +# 1. Study area +draw_box(ax, MX, 9.55, BW, 0.62, + ["Study Area Definition", + "24 HUC8s | NC / SC Coast (HUC2 = 03) | Test HUC8: 03020102"], + C_PIPE) + +# 2. Stage 1 +draw_box(ax, MX, 8.60, BW, 0.65, + ["Stage 1 - Input Data Acquisition", + "NWM streams WBD boundaries DEM NHD flowlines"], + C_PIPE) + +# 3. Stage 2 +draw_box(ax, MX, 7.38, BW, 0.90, + ["Stage 2 - HAND-FIM Preprocessing (OWP FIMbox)", + "17 branches (1 trunk + 16 level-path branches)", + "HAND rasters | Synthetic Rating Curves (SRCs) per HydroID"], + C_PIPE) + +# 4. USGS Crosswalk +draw_box(ax, MX, 5.93, BW, 0.90, + ["USGS Gage Crosswalk", + "Download gauges | Assign to NWM branches", + "Snap to thalweg | Extract HAND-stage elevation"], + C_CUSTOM) + +# 5. n Calibration +draw_box(ax, MX, 4.48, BW, 0.90, + ["Manning's n Calibration", + "Back-calculate n at 6 recurrence-interval zones (2, 5, 10, 25, 50, >50 yr)", + "Piecewise n applied to directly calibrated branches only"], + C_CUSTOM) + +# 6a. Calibrated branches (left branch) +draw_box(ax, 4.5, 3.00, 4.6, 0.82, + ["Calibrated Branches (2 of 17)", + "Branch 3351000014 | Gauge 02083000", + "Branch 3351000023 | Gauge 02082950"], + C_OUT) + +# 6b. Ungauged (right branch — future) +draw_box(ax, 13.2, 3.00, 4.6, 0.82, + ["Ungauged Branches (15 of 17)", + "ML-based n Regionalization", + "(future work)"], + C_ML, future=True) + +# 7. Validation plots +draw_box(ax, 4.5, 1.72, 4.6, 0.82, + ["Calibration Validation Plots", + "Orig. SRC vs Calibrated SRC vs USGS RC", + "Manning's n variation with HAND stage"], + C_CUSTOM) + +# 8. Future pipeline bar +draw_box(ax, 8.7, 0.55, 14.0, 0.60, + ["Stage 3: Streamflow Mapping Stage 4: FIM Calibration Stage 5: FIM Generation (Flood Inundation Maps)"], + C_FUTURE, future=True) + +# ── External data source boxes ──────────────────────────────────── + +# USGS gauge network → Crosswalk +draw_box(ax, 15.2, 5.93, 2.9, 0.62, + ["USGS Gauge Network", + "ArcGIS Online API"], + C_DATA, fs=8.8) + +# USGS RC + NWM Recurrence → Calibration +draw_box(ax, 15.2, 4.48, 2.9, 0.82, + ["USGS Rating Curves", + "usgs_rating_curves.parquet", + "NWM Recurrence Flows (2-50 yr)"], + C_DATA, fs=8.5) + +# ── Arrows ──────────────────────────────────────────────────────── + +PIPE_COL = "#1b4f72" + +# Main vertical flow +arrow(ax, MX, 9.24, MX, 8.93) # Study → Stage1 +arrow(ax, MX, 8.28, MX, 7.83) # Stage1 → Stage2 +arrow(ax, MX, 6.93, MX, 6.38) # Stage2 → Crosswalk +arrow(ax, MX, 5.48, MX, 4.93) # Crosswalk → Calibration + +# Calibration → split +arrow(ax, 7.0, 4.03, 5.0, 3.42, color="#0e6655") # → Calibrated +arrow(ax, 10.4, 4.03, 12.6, 3.42, color=C_ML) # → Ungauged + +# Calibrated → Validation +arrow(ax, 4.5, 2.59, 4.5, 2.13, color="#0e6655") + +# Validation → Future +arrow(ax, 5.8, 1.31, 7.5, 0.85, color="#777", lw=1.5) + +# Data inputs (horizontal) +arrow(ax, 13.75, 5.93, 11.51, 5.93, color=C_DATA, lw=1.6) # gauges → crosswalk +arrow(ax, 13.75, 4.48, 11.51, 4.48, color=C_DATA, lw=1.6) # RC+NWM → calibration + +# ── "Future Work" separator line ────────────────────────────────── + +ax.plot([1.0, 16.0], [1.18, 1.18], + color="#aaa", lw=1.2, ls="--", zorder=2) +ax.text(16.1, 1.18, "Future", ha="left", va="center", + fontsize=8.5, color="#777", fontstyle="italic") + +# ── Legend (upper-left, clear of all boxes) ────────────────────── + +items = [ + (C_PIPE, "OWP FIMbox Pipeline Stage (completed)", False), + (C_CUSTOM, "Custom Script (this project)", False), + (C_DATA, "External Data Source", False), + (C_OUT, "Calibrated Output", False), + (C_ML, "Future - ML Regionalization", True), + (C_FUTURE, "Future - Pipeline Continuation", True), +] + +lx0, ly0, dy = 0.55, 8.60, 0.50 + +# subtle background panel +n_items = len(items) +panel_h = dy * n_items + 0.15 +panel_y = ly0 - panel_h + 0.38 +ax.add_patch(FancyBboxPatch( + (0.18, panel_y - 0.10), 4.55, panel_h + 0.55, + boxstyle="round,pad=0.08", + facecolor="white", edgecolor="#ccc", + linewidth=0.9, alpha=0.82, zorder=2, +)) + +ax.text(0.40, ly0 + 0.38, "Legend", ha="left", va="center", + fontsize=9.5, fontweight="bold", color="#333", zorder=5) + +for i, (col, lbl, fut) in enumerate(items): + ly = ly0 - i * dy + legend_entry(ax, lx0 + 0.22, ly, 0.44, 0.30, col, lbl, future=fut) + +fig.tight_layout(pad=0.3) + +out = SAVE_TO / "workflow_chart.png" +SAVE_TO.mkdir(parents=True, exist_ok=True) +fig.savefig(out, dpi=200, bbox_inches="tight", facecolor=BG) +plt.close(fig) +print(f"Saved: {out}") diff --git a/scripts/s00_study_boundary.py b/scripts/s00_study_boundary.py new file mode 100644 index 0000000..3faf1a0 --- /dev/null +++ b/scripts/s00_study_boundary.py @@ -0,0 +1,66 @@ +from pathlib import Path +import pandas as pd +import geopandas as gpd +from shapely import wkt + +# ── EDIT THESE ─────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") + +# "wkt" — use the wkt_geom column already in the Excel (requires knowing the CRS) +# "service" — fetch boundaries by HUC_CODE from the USGS web service (no CRS needed) +MODE = "service" + +# Used only when MODE = "wkt". Confirm the CRS with your data source. +# Coordinate values ~1.5M–1.65M suggest EPSG:5070 (NAD83 Conus Albers). +WKT_CRS = "EPSG:5069" + +# Optional: list specific HUC_CODE values to include. +# Leave as [] to use ALL rows in the Excel. +HUC_CODES = [] # e.g. [3020102, 3020103, 3020104] +# ───────────────────────────────────────────────────────────────── + +OUT_PATH = Path("docs/study_boundary/study_area.gpkg") + +df = pd.read_excel(EXCEL_PATH) +print(f"Loaded {len(df)} rows from Excel") + +if HUC_CODES: + df = df[df["HUC_CODE"].isin(HUC_CODES)] + print(f"Filtered to {len(df)} rows matching HUC_CODES") + +if MODE == "wkt": + # ── Option A: use WKT geometry already in the Excel ────────── + df["geometry"] = df["wkt_geom"].apply(wkt.loads) + gdf = gpd.GeoDataFrame(df, geometry="geometry", crs=WKT_CRS) + +elif MODE == "service": + # ── Option B: fetch boundaries from USGS web service ───────── + # HUC_CODE in the Excel may be 7-digit (e.g. 3020102); zero-pad to 8. + from fimbox import HUC8Finder + finder = HUC8Finder() + frames = [] + skipped = [] + for code in df["HUC_CODE"]: + huc8 = str(int(code)).zfill(8) + print(f" Fetching HUC8 {huc8} ...") + try: + frames.append(finder.from_huc8(huc8)) + except ValueError: + print(f" WARNING: HUC8 {huc8} not found in service — skipping") + skipped.append(huc8) + if skipped: + print(f"\nSkipped {len(skipped)} HUC(s) not found in service: {skipped}") + gdf = pd.concat(frames, ignore_index=True) + gdf = gpd.GeoDataFrame(gdf, geometry="geometry") + +else: + raise ValueError(f"MODE must be 'wkt' or 'service', got {MODE!r}") + +dissolved = gdf.dissolve().reset_index(drop=True)[["geometry"]] +# Simplify to reduce vertex count — the ArcGIS FeatureServer rejects +# overly complex geometries as spatial filters (returns HTTP 400). +dissolved["geometry"] = dissolved["geometry"].simplify(0.01, preserve_topology=True) + +OUT_PATH.parent.mkdir(parents=True, exist_ok=True) +dissolved.to_file(OUT_PATH, driver="GPKG", index=False) +print(f"Saved merged boundary ({len(df)} HUCs dissolved) --> {OUT_PATH}") diff --git a/scripts/s01_download_dem.py b/scripts/s01_download_dem.py new file mode 100644 index 0000000..fc3013b --- /dev/null +++ b/scripts/s01_download_dem.py @@ -0,0 +1,96 @@ +""" +Download 10m 3DEP DEM per HUC8, save each tile to disk, then merge. +This avoids the in-memory OOM that happens when merging all tiles at once +for a large (24 HUC) study area. + +Run ONCE before test_getallinputdata.py, then pass the output path via dem= in the test. +""" +from pathlib import Path + +import pandas as pd +import geopandas as gpd +import py3dep +from rasterio.merge import merge as rio_merge + +# ── EDIT THESE ──────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +HUC_CODES = [] # leave empty to use ALL rows; or e.g. [3020102, 3020103] +EPSG = 5070 # CONUS Albers — matches the rest of the fimbox pipeline +RESOLUTION = 10 # metres +# ────────────────────────────────────────────────────────────────── + +TEMP_DIR = Path("E:/SI/out/study_area/watershed-data/dem_tiles") +OUT_DEM = Path("E:/SI/out/study_area/watershed-data/3dep_dem_10m.tif") + +from fimbox.preprocessing.download_data.utils import HUC8Finder + + +def _download_huc(huc8: str, finder: HUC8Finder) -> Path | None: + tile_path = TEMP_DIR / f"dem_{huc8}.tif" + if tile_path.exists(): + print(f" {huc8}: tile already on disk — skipping download") + return tile_path + + print(f" {huc8}: fetching boundary …") + try: + gdf = finder.from_huc8(huc8) + except ValueError: + print(f" {huc8}: WARNING — not found in service, skipping") + return None + + geom = gdf.to_crs(epsg=4326).union_all() + + print(f" {huc8}: downloading 3DEP 10m …") + dem = py3dep.get_dem(geom, resolution=RESOLUTION, crs=4326) + + print(f" {huc8}: reprojecting to EPSG:{EPSG} …") + dem = dem.rio.reproject(f"EPSG:{EPSG}") + dem = dem.where(dem > -90000, -999999) + dem.rio.write_nodata(-999999, inplace=True) + dem.rio.to_raster( + str(tile_path), + driver="GTiff", compress="lzw", tiled=True, + blockxsize=256, blockysize=256, + ) + print(f" {huc8}: saved → {tile_path}") + return tile_path + + +def main(): + TEMP_DIR.mkdir(parents=True, exist_ok=True) + OUT_DEM.parent.mkdir(parents=True, exist_ok=True) + + df = pd.read_excel(EXCEL_PATH) + print(f"Loaded {len(df)} rows from Excel") + + if HUC_CODES: + df = df[df[HUC_CODE_COL].isin(HUC_CODES)] + print(f"Filtered to {len(df)} rows") + + finder = HUC8Finder() + tile_paths = [] + for code in df[HUC_CODE_COL]: + huc8 = str(int(code)).zfill(8) + path = _download_huc(huc8, finder) + if path is not None: + tile_paths.append(path) + + if not tile_paths: + raise RuntimeError("No tiles downloaded — nothing to merge.") + + print(f"\nMerging {len(tile_paths)} tiles to {OUT_DEM} …") + rio_merge( + [str(p) for p in tile_paths], + method="first", + dst_path=str(OUT_DEM), + ) + + print(f"\nDone. Merged DEM → {OUT_DEM}") + print("\nNext step: in tests/test_getallinputdata.py add:") + print(f' dem=Path("{OUT_DEM}"),') + print("to the getAllInputData() call.") + + +if __name__ == "__main__": + main() diff --git a/scripts/s02_stage1_all_hucs.py b/scripts/s02_stage1_all_hucs.py new file mode 100644 index 0000000..31cf897 --- /dev/null +++ b/scripts/s02_stage1_all_hucs.py @@ -0,0 +1,100 @@ +""" +Stage 1 — Data Download for all HUC8s in the study area. +Downloads NWM flowlines, catchments, levees, OSM roads, FEMA data. +Uses pre-downloaded 10m DEM tiles from DEM_TILES_DIR. + +Progress is written to TASK_LOG after each HUC so a re-run skips completed ones. + +Run: + .venv\\Scripts\\python.exe scripts/run_stage1_download.py +""" +import logging +import traceback +from pathlib import Path +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +DEM_TILES_DIR = Path("E:/SI/out/study_area/watershed-data/dem_tiles") +OUT_DIR = Path("E:/SI/out") +IDENTIFIER = "nwmmr" +BUFFER_M = 5000 +TASK_LOG = Path("E:/SI/out/stage1_status.txt") # records pass/fail per HUC +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _load_done() -> set[str]: + """Return set of HUC8s already marked PASS in the task log.""" + if not TASK_LOG.exists(): + return set() + done = set() + for line in TASK_LOG.read_text().splitlines(): + parts = line.strip().split() + if len(parts) == 3 and parts[2] == "PASS": + done.add(parts[1]) + return done + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage1 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def run_huc(huc8: str) -> None: + import fimbox + + dem_path = DEM_TILES_DIR / f"dem_{huc8}.tif" + if not dem_path.exists(): + raise FileNotFoundError(f"DEM tile not found: {dem_path}") + + pp = fimbox.getAllInputData( + huc8=huc8, + out_dir=str(OUT_DIR), + buffer_m=BUFFER_M, + headwater_buffer_cells=8, + get_flowlines=True, + get_catchments=True, + resolution="medium", + identifier=IDENTIFIER, + dem=dem_path, + ) + pp.run() + + +def main(): + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + done = _load_done() + remaining = [h for h in hucs if h not in done] + + log.info(f"Stage 1: {len(hucs)} total | {len(done)} already done | {len(remaining)} to run") + + passed, failed = list(done), [] + for i, huc8 in enumerate(remaining, 1): + log.info(f" [{i}/{len(remaining)}] HUC8 = {huc8}") + try: + run_huc(huc8) + _log_result(huc8, "PASS") + passed.append(huc8) + log.info(f" [{huc8}] PASS → {OUT_DIR / f'HUC{huc8}'}") + except Exception: + err = traceback.format_exc().splitlines()[-1] + _log_result(huc8, "FAIL", err) + failed.append(huc8) + log.error(f" [{huc8}] FAIL: {err}") + + log.info(f"\nStage 1 done. Passed: {len(passed)} Failed: {len(failed)}") + if failed: + log.warning(f"Failed HUCs: {failed}") + log.warning(f"Re-run this script to retry failures (edit TASK_LOG to remove their FAIL lines first).") + + +if __name__ == "__main__": + main() diff --git a/scripts/s02_stage1_single_huc.py b/scripts/s02_stage1_single_huc.py new file mode 100644 index 0000000..808844a --- /dev/null +++ b/scripts/s02_stage1_single_huc.py @@ -0,0 +1,65 @@ +""" +Stage 1 — Data Download for a SINGLE HUC8. +Use this to test one HUC before running run_stage1_download.py on all 24. + +Set HUC8 below and run: + .venv\\Scripts\\python.exe scripts/run_single_huc_stage1.py +""" +import logging +import traceback +from pathlib import Path + +# ── SET THIS ────────────────────────────────────────────────────── +HUC8 = "03020102" +# ───────────────────────────────────────────────────────────────── + +DEM_TILES_DIR = Path("D:/SI/out/study_area/watershed-data/dem_tiles") +OUT_DIR = Path("D:/SI/out") +IDENTIFIER = "nwmmr" +BUFFER_M = 5000 +TASK_LOG = Path("D:/SI/out/stage1_status.txt") + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage1 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def main(): + import fimbox + + dem_path = DEM_TILES_DIR / f"dem_{HUC8}.tif" + if not dem_path.exists(): + log.error(f"DEM tile not found: {dem_path}") + return + + log.info(f"Stage 1 — HUC8: {HUC8}") + try: + pp = fimbox.getAllInputData( + huc8=HUC8, + out_dir=str(OUT_DIR), + buffer_m=BUFFER_M, + headwater_buffer_cells=8, + get_flowlines=True, + get_catchments=True, + resolution="medium", + identifier=IDENTIFIER, + dem=dem_path, + ) + pp.run() + _log_result(HUC8, "PASS") + log.info(f"PASS → {OUT_DIR / f'HUC{HUC8}'}") + except Exception: + err = traceback.format_exc() + _log_result(HUC8, "FAIL", err.splitlines()[-1]) + log.error(f"FAIL:\n{err}") + + +if __name__ == "__main__": + main() diff --git a/scripts/s03_stage2_all_hucs.py b/scripts/s03_stage2_all_hucs.py new file mode 100644 index 0000000..ba6129a --- /dev/null +++ b/scripts/s03_stage2_all_hucs.py @@ -0,0 +1,189 @@ +""" +Stage 2 — HAND Processing for all HUC8s in the study area. +Runs BranchDerivation then calculate_allbranches (DEM conditioning, +HAND rasters, synthetic rating curves) for each HUC. + +Prerequisite: run_stage1_download.py must have completed for a HUC +before this script processes it. + +Progress is written to TASK_LOG after each HUC so a re-run skips completed ones. + +Run: + .venv\\Scripts\\python.exe scripts/run_stage2_hand.py +""" +import logging +import time +import traceback +from datetime import datetime +from pathlib import Path +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") +IDENTIFIER = "nwmmr" +TASK_LOG = Path("E:/SI/out/stage2_status.txt") +CONFIG_DIR = Path("config") # deny lists live here +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _load_done() -> set[str]: + if not TASK_LOG.exists(): + return set() + done = set() + for line in TASK_LOG.read_text().splitlines(): + parts = line.strip().split() + if len(parts) >= 3 and parts[2] == "PASS": + done.add(parts[1]) + return done + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage2 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def run_huc(huc8: str) -> None: + from fimbox import AOIProcessingConfig, BranchDerivation, calculate_allbranches + from fimbox._dask import _resolve_n_workers + + aoi_dir = OUT_DIR / f"HUC{huc8}" / "watershed-data" + if not aoi_dir.exists(): + raise FileNotFoundError(f"watershed-data not found — run Stage 1 first: {aoi_dir}") + + def _opt(p: Path): + return p if p.exists() else None + + # Step Z0: level paths + branch polygons + BranchDerivation( + out_dir=aoi_dir, + branch_id_attribute="levpa_id", + reach_id_attribute="ID", + branch_buffer_distance_meters=7000.0, + ).run() + + # Steps Z1 + B: BranchZero (whole AOI) then all non-zero branches in parallel + cfg = AOIProcessingConfig( + aoi_dir=aoi_dir, + branch_list_path=aoi_dir / "branch_ids.lst", + dem_path=aoi_dir / "dem.tif", + streams_gpkg=aoi_dir / f"{IDENTIFIER}_subset_streams.gpkg", + boundary_gpkg=aoi_dir / "wbd_buffered.gpkg", + bridge_elev_diff_path=_opt(aoi_dir / "bridge_elev_diff.tif"), + levee_gpkg_path=_opt(aoi_dir / "3d_nld_subset_levees_burned.gpkg"), + headwaters_gpkg=_opt(aoi_dir / f"{IDENTIFIER}_headwater_points_subset.gpkg"), + levelpaths_extended_gpkg=_opt( + aoi_dir / f"{IDENTIFIER}_subset_streams_levelPaths_extended.gpkg" + ), + # AGREE DEM conditioning + agree_buffer_m=15.0, + agree_smooth_drop=10.0, + agree_sharp_drop=1000.0, + # HAND geometry + cost_distance_tolerance=50.0, + lateral_elevation_threshold=10, + max_split_distance_m=1500.0, + slope_min=0.0001, + lakes_buffer_dist_m=100.0, + # SRC + mannings_n=0.06, + stage_min_m=0.0, + stage_interval_m=0.3048, + stage_max_m=25.2984, + min_catchment_area=0.25, + min_stream_length=0.5, + crosswalk_max_distance_m=100.0, + src_slope_source="iris_sword", + iris_slope_csv=None, + hfab_slope_column=None, + # execution + n_workers=_resolve_n_workers(), + keep_failed_branches=True, + delete_deny_list=True, + ) + + calculate_allbranches( + cfg, + run_branch_zero=True, + delete_deny_list=True, + deny_unit_list=CONFIG_DIR / "deny_unit.lst", + branch_ids_csv=aoi_dir / "branch_ids.csv", + ) + + +def _fmt(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h}h {m:02d}m {s:02d}s" + if m: + return f"{m}m {s:02d}s" + return f"{s}s" + + +def main(): + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + done = _load_done() + remaining = [h for h in hucs if h not in done] + + log.info(f"Stage 2: {len(hucs)} total | {len(done)} already done | {len(remaining)} to run") + log.info(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + batch_start = time.time() + huc_times: list[tuple[str, float, str]] = [] # (huc8, elapsed_s, status) + passed, failed = list(done), [] + + for i, huc8 in enumerate(remaining, 1): + huc_start = time.time() + log.info(f" [{i}/{len(remaining)}] HUC8 = {huc8} | " + f"batch elapsed: {_fmt(huc_start - batch_start)}") + try: + run_huc(huc8) + elapsed = time.time() - huc_start + _log_result(huc8, "PASS") + passed.append(huc8) + huc_times.append((huc8, elapsed, "PASS")) + + # running average and ETA over completed HUCs + completed = [t for _, t, s in huc_times if s == "PASS"] + avg_s = sum(completed) / len(completed) + remaining_n = len(remaining) - i + eta_s = avg_s * remaining_n + log.info(f" [{huc8}] PASS | this HUC: {_fmt(elapsed)} | " + f"avg: {_fmt(avg_s)} | remaining: {remaining_n} | " + f"ETA: {_fmt(eta_s)}" + + (f" (~{datetime.fromtimestamp(time.time() + eta_s).strftime('%H:%M')})" + if remaining_n > 0 else " (last HUC)")) + except Exception: + elapsed = time.time() - huc_start + err = traceback.format_exc().splitlines()[-1] + _log_result(huc8, "FAIL", err) + failed.append(huc8) + huc_times.append((huc8, elapsed, "FAIL")) + log.error(f" [{huc8}] FAIL after {_fmt(elapsed)} | {err}") + + total = time.time() - batch_start + log.info(f"\n{'─'*60}") + log.info(f"Stage 2 complete | total time: {_fmt(total)}") + log.info(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f"Passed: {len(passed)} | Failed: {len(failed)}") + if huc_times: + log.info(f"\nPer-HUC timing summary:") + for huc8, elapsed, status in huc_times: + log.info(f" {huc8} {status:4s} {_fmt(elapsed)}") + if failed: + log.warning(f"\nFailed HUCs: {failed}") + log.warning("Re-run this script to retry (FAIL lines are automatically retried).") + + +if __name__ == "__main__": + main() diff --git a/scripts/s03_stage2_single_huc.py b/scripts/s03_stage2_single_huc.py new file mode 100644 index 0000000..4efb051 --- /dev/null +++ b/scripts/s03_stage2_single_huc.py @@ -0,0 +1,107 @@ +""" +Stage 2 — HAND Processing for a SINGLE HUC8. +Use this to test one HUC before running run_stage2_hand.py on all 24. + +Prerequisite: Stage 1 must have completed for this HUC. + +Set HUC8 below and run: + .venv\\Scripts\\python.exe scripts/run_single_huc_stage2.py +""" +import logging +import traceback +from pathlib import Path + +# ── SET THIS ────────────────────────────────────────────────────── +HUC8 = "03020102" +# ───────────────────────────────────────────────────────────────── + +OUT_DIR = Path("D:/SI/out") +IDENTIFIER = "nwmmr" +CONFIG_DIR = Path("config") +TASK_LOG = Path("D:/SI/out/stage2_status.txt") + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage2 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def main(): + from fimbox import AOIProcessingConfig, BranchDerivation, calculate_allbranches + from fimbox._dask import _resolve_n_workers + + aoi_dir = OUT_DIR / f"HUC{HUC8}" / "watershed-data" + if not aoi_dir.exists(): + log.error(f"watershed-data not found — run Stage 1 first: {aoi_dir}") + return + + def _opt(p: Path): + return p if p.exists() else None + + log.info(f"Stage 2 — HUC8: {HUC8}") + try: + BranchDerivation( + out_dir=aoi_dir, + branch_id_attribute="levpa_id", + reach_id_attribute="ID", + branch_buffer_distance_meters=7000.0, + ).run() + + cfg = AOIProcessingConfig( + aoi_dir=aoi_dir, + branch_list_path=aoi_dir / "branch_ids.lst", + dem_path=aoi_dir / "dem.tif", + streams_gpkg=aoi_dir / f"{IDENTIFIER}_subset_streams.gpkg", + boundary_gpkg=aoi_dir / "wbd_buffered.gpkg", + bridge_elev_diff_path=_opt(aoi_dir / "bridge_elev_diff.tif"), + levee_gpkg_path=_opt(aoi_dir / "3d_nld_subset_levees_burned.gpkg"), + headwaters_gpkg=_opt(aoi_dir / f"{IDENTIFIER}_headwater_points_subset.gpkg"), + levelpaths_extended_gpkg=_opt( + aoi_dir / f"{IDENTIFIER}_subset_streams_levelPaths_extended.gpkg" + ), + agree_buffer_m=15.0, + agree_smooth_drop=10.0, + agree_sharp_drop=1000.0, + cost_distance_tolerance=50.0, + lateral_elevation_threshold=10, + max_split_distance_m=1500.0, + slope_min=0.0001, + lakes_buffer_dist_m=100.0, + mannings_n=0.06, + stage_min_m=0.0, + stage_interval_m=0.3048, + stage_max_m=25.2984, + min_catchment_area=0.25, + min_stream_length=0.5, + crosswalk_max_distance_m=100.0, + src_slope_source="iris_sword", + iris_slope_csv=None, + hfab_slope_column=None, + n_workers=_resolve_n_workers(), + keep_failed_branches=True, + delete_deny_list=True, + ) + + calculate_allbranches( + cfg, + run_branch_zero=True, + delete_deny_list=True, + deny_unit_list=CONFIG_DIR / "deny_unit.lst", + branch_ids_csv=aoi_dir / "branch_ids.csv", + ) + _log_result(HUC8, "PASS") + log.info("PASS") + except Exception: + err = traceback.format_exc() + _log_result(HUC8, "FAIL", err.splitlines()[-1]) + log.error(f"FAIL:\n{err}") + + +if __name__ == "__main__": + main() diff --git a/scripts/s04_usgs_gage_crosswalk.py b/scripts/s04_usgs_gage_crosswalk.py new file mode 100644 index 0000000..64b3365 --- /dev/null +++ b/scripts/s04_usgs_gage_crosswalk.py @@ -0,0 +1,216 @@ +""" +USGS Gage Crosswalk for a single HUC8. +Implements steps C20, C21, C22 from the repo (currently commented-out in tests). + +Steps: + C20 Download USGS gauge points within the HUC8 boundary from ArcGIS Online + C21 Assign each gauge to a NWM level path (branch ID) + C22 Per-branch: snap gauges to thalweg, sample DEM to get dem_adj_elevation + CON Consolidate all per-branch usgs_elev_table.csv into one at AOI root + (required by Stage 4 UsgsRatingCalibrator and the n-calibration script) + +Prerequisite: Stage 2 must have completed (all branch directories must exist). + +Run: + .venv\\Scripts\\python.exe scripts/run_usgs_gage_crosswalk.py +""" +import logging +import traceback +from pathlib import Path + +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +HUC8 = "03020102" +OUT_DIR = Path("D:/SI/out") +# ───────────────────────────────────────────────────────────────── + +IDENTIFIER = "nwmmr" +AOI_ROOT = OUT_DIR / f"HUC{HUC8}" +WATERSHED = AOI_ROOT / "watershed-data" +BRANCHES = WATERSHED / "branches" + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def step_c20_download_gages() -> Path: + """Download USGS gauge points within the HUC8 boundary.""" + from fimbox import DownloadUSGSGages + + out_path = WATERSHED / "usgs_gages.gpkg" + if out_path.exists(): + log.info("C20 SKIP (exists): %s", out_path.name) + return out_path + + boundary_gpkg = WATERSHED / "wbd.gpkg" + if not boundary_gpkg.exists(): + raise FileNotFoundError(f"HUC boundary not found: {boundary_gpkg}") + + log.info("C20: Downloading USGS gages for HUC8 %s …", HUC8) + gdf = DownloadUSGSGages(out_sr=5070, n_workers=8).download( + boundary=str(boundary_gpkg), + aoi_id=HUC8, + out_dir=WATERSHED, + out_name="usgs_gages.gpkg", + ) + if gdf.empty: + raise RuntimeError(f"No USGS gages found for HUC8 {HUC8}") + + log.info("C20 PASS: %d gages --> %s", len(gdf), out_path.name) + return out_path + + +def step_c21_assign_to_branches(usgs_gages_gpkg: Path) -> tuple[Path, Path]: + """Tag every gauge with a feature_id + levpa_id (branch ID).""" + from fimbox import assign_gages_to_branches + + out_aoi = WATERSHED / "usgs_subset_gages.gpkg" + out_bzero = WATERSHED / f"usgs_subset_gages_0.gpkg" + + if out_aoi.exists() and out_bzero.exists(): + log.info("C21 SKIP (exists): %s + %s", out_aoi.name, out_bzero.name) + return out_aoi, out_bzero + + levelpaths_gpkg = WATERSHED / f"{IDENTIFIER}_subset_streams_levelPaths.gpkg" + if not levelpaths_gpkg.exists(): + raise FileNotFoundError(f"Level-paths gpkg not found: {levelpaths_gpkg}") + + log.info("C21: Assigning gages to branches …") + result = assign_gages_to_branches( + usgs_gages_gpkg=usgs_gages_gpkg, + nwm_streams_levelpaths_gpkg=levelpaths_gpkg, + aoi_id=HUC8, + out_dir=WATERSHED, + aoi_filter_column="aoi_id", + branch_zero_id="0", + ) + if result is None: + raise RuntimeError("No gages could be assigned to branches") + + log.info("C21 PASS: %d gages assigned", len(result.aoi_gages)) + return result.aoi_gages_path, result.branch_zero_gages_path + + +def step_c22_per_branch_crosswalk(aoi_gages_gpkg: Path, bzero_gages_gpkg: Path) -> None: + """ + Snap gauges to the DEM thalweg in every branch; writes usgs_elev_table.csv per branch. + + Stage 2 (HAND) cleans up per-branch DEMs after use. The shared + ``watershed-data/dem.tif`` covers the full AOI and is used as both the + raw-elevation and thalweg-elevation raster. The small bias this introduces + (thalweg-conditioned DEM is slightly lower at channels) is acceptable when + the goal is to correct only gauged SRCs. + """ + from fimbox import run_branch_crosswalk + + # Shared DEM — covers entire watershed, used for all branches. + shared_dem = WATERSHED / "dem.tif" + if not shared_dem.exists(): + raise FileNotFoundError(f"Shared DEM not found: {shared_dem}") + + # Identify which branches actually contain gauges to avoid unnecessary work. + import geopandas as gpd + aoi_gages = gpd.read_file(aoi_gages_gpkg) + gauged_branch_ids = set(aoi_gages["levpa_id"].dropna().astype(str).unique()) + # Branch zero always gets a pass. + gauged_branch_ids.add("0") + log.info("C22: branches with gauges = %s", sorted(gauged_branch_ids)) + + branch_dirs = sorted(d for d in BRANCHES.iterdir() if d.is_dir()) + if not branch_dirs: + raise FileNotFoundError(f"No branch directories under {BRANCHES}") + + for branch_dir in branch_dirs: + bid = branch_dir.name + if bid not in gauged_branch_ids: + log.debug("C22 SKIP branch %s (no gauges)", bid) + continue + + out_table = branch_dir / "usgs_elev_table.csv" + if out_table.exists(): + log.info("C22 SKIP branch %s (exists)", bid) + continue + + # Branch zero uses the dedicated branch-zero gage set (levpa_id="0") + gages_gpkg = bzero_gages_gpkg if bid == "0" else aoi_gages_gpkg + + catchments_gpkg = branch_dir / ( + f"gw_catchments_reaches_filtered_addedAttributes_crosswalked_{bid}.gpkg" + ) + flows_gpkg = branch_dir / ( + f"demDerived_reaches_split_filtered_addedAttributes_crosswalked_{bid}.gpkg" + ) + + missing = [p for p in (catchments_gpkg, flows_gpkg) if not p.exists()] + if missing: + log.warning("C22 SKIP branch %s — missing: %s", bid, + ", ".join(p.name for p in missing)) + continue + + # Use per-branch dem if it survived cleanup; otherwise fall back to the shared DEM. + branch_dem = branch_dir / f"dem_{bid}.tif" + dem_path = branch_dem if branch_dem.exists() else shared_dem + + log.info("C22: crosswalk branch %s (dem=%s) …", bid, dem_path.name) + try: + out = run_branch_crosswalk( + aoi_gages_gpkg=gages_gpkg, + branch_catchments_gpkg=catchments_gpkg, + branch_flows_gpkg=flows_gpkg, + dem_path=dem_path, + dem_thalweg_path=dem_path, # shared DEM doubles as thalweg DEM + branch_id=bid, + out_dir=branch_dir, + ) + written = [p for p in out.values() if p is not None] + if written: + log.info(" branch %s OK: %s", bid, ", ".join(p.name for p in written)) + else: + log.info(" branch %s: no gages intersected catchments", bid) + except Exception: + log.error(" branch %s FAIL:\n%s", bid, traceback.format_exc()) + + +def step_con_consolidate() -> Path: + """ + Merge all per-branch usgs_elev_table.csv files into one at the AOI root. + Stage 4 UsgsRatingCalibrator and the n-calibration script both read from + aoi_root/usgs_elev_table.csv. + """ + out_path = AOI_ROOT / "usgs_elev_table.csv" + + frames = [] + for branch_dir in sorted(d for d in BRANCHES.iterdir() if d.is_dir()): + t = branch_dir / "usgs_elev_table.csv" + if t.exists(): + df = pd.read_csv(t, dtype={"location_id": str}) + df["levpa_id"] = branch_dir.name + frames.append(df) + + if not frames: + log.warning("CON: no per-branch usgs_elev_table.csv found — no gauges in HUC?") + return out_path + + consolidated = pd.concat(frames, ignore_index=True) + consolidated = consolidated.drop_duplicates(subset=["location_id", "HydroID"]) + consolidated.to_csv(out_path, index=False) + log.info("CON PASS: %d gage rows --> %s", len(consolidated), out_path) + return out_path + + +def main(): + log.info("=== USGS Gage Crosswalk — HUC8 %s ===", HUC8) + + usgs_gages_gpkg = step_c20_download_gages() + aoi_gages_gpkg, bzero_gages_gpkg = step_c21_assign_to_branches(usgs_gages_gpkg) + step_c22_per_branch_crosswalk(aoi_gages_gpkg, bzero_gages_gpkg) + elev_table = step_con_consolidate() + + log.info("=== Done. usgs_elev_table.csv --> %s ===", elev_table) + + +if __name__ == "__main__": + main() diff --git a/scripts/s04_usgs_gage_crosswalk_all_hucs.py b/scripts/s04_usgs_gage_crosswalk_all_hucs.py new file mode 100644 index 0000000..95c1d56 --- /dev/null +++ b/scripts/s04_usgs_gage_crosswalk_all_hucs.py @@ -0,0 +1,243 @@ +""" +USGS Gage Crosswalk for all HUC8s in the study area. + +Steps per HUC: + C20 Download USGS gauge points within the HUC8 boundary from ArcGIS Online + C21 Assign each gauge to a NWM level path (branch ID) + C22 Per-branch: snap gauges to thalweg, sample DEM to get dem_adj_elevation + CON Consolidate all per-branch usgs_elev_table.csv into one at AOI root + (required by s05 n-calibration and Stage 4 UsgsRatingCalibrator) + +Prerequisite: s03_stage2_all_hucs.py must have completed for each HUC. + +Progress is written to TASK_LOG after each HUC so a re-run skips completed ones. + +Run: + .venv\\Scripts\\python.exe scripts/s04_usgs_gage_crosswalk_all_hucs.py +""" +import logging +import time +import traceback +from datetime import datetime +from pathlib import Path + +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") +IDENTIFIER = "nwmmr" +TASK_LOG = Path("E:/SI/out/crosswalk_status.txt") +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _load_done() -> set[str]: + if not TASK_LOG.exists(): + return set() + done = set() + for line in TASK_LOG.read_text().splitlines(): + parts = line.strip().split() + if len(parts) >= 3 and parts[2] == "PASS": + done.add(parts[1]) + return done + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"crosswalk {huc8} {status}{(' ' + note) if note else ''}\n") + + +def _fmt(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h}h {m:02d}m {s:02d}s" + if m: + return f"{m}m {s:02d}s" + return f"{s}s" + + +def run_huc(huc8: str) -> None: + from fimbox import DownloadUSGSGages, assign_gages_to_branches, run_branch_crosswalk + import geopandas as gpd + + aoi_root = OUT_DIR / f"HUC{huc8}" + watershed = aoi_root / "watershed-data" + branches = watershed / "branches" + + if not watershed.exists(): + raise FileNotFoundError(f"watershed-data not found — run Stage 2 first: {watershed}") + + # ── C20: Download gages ────────────────────────────────────────── + usgs_gages_gpkg = watershed / "usgs_gages.gpkg" + if not usgs_gages_gpkg.exists(): + boundary_gpkg = watershed / "wbd.gpkg" + if not boundary_gpkg.exists(): + raise FileNotFoundError(f"HUC boundary not found: {boundary_gpkg}") + log.info(" C20: downloading USGS gages for HUC8 %s …", huc8) + gdf = DownloadUSGSGages(out_sr=5070, n_workers=8).download( + boundary=str(boundary_gpkg), + aoi_id=huc8, + out_dir=watershed, + out_name="usgs_gages.gpkg", + ) + if gdf.empty: + raise RuntimeError(f"No USGS gages found for HUC8 {huc8}") + log.info(" C20 PASS: %d gages", len(gdf)) + else: + log.info(" C20 SKIP (exists): usgs_gages.gpkg") + + # ── C21: Assign to branches ────────────────────────────────────── + out_aoi = watershed / "usgs_subset_gages.gpkg" + out_bzero = watershed / "usgs_subset_gages_0.gpkg" + if not out_aoi.exists() or not out_bzero.exists(): + levelpaths_gpkg = watershed / f"{IDENTIFIER}_subset_streams_levelPaths.gpkg" + if not levelpaths_gpkg.exists(): + raise FileNotFoundError(f"Level-paths gpkg not found: {levelpaths_gpkg}") + log.info(" C21: assigning gages to branches …") + result = assign_gages_to_branches( + usgs_gages_gpkg=usgs_gages_gpkg, + nwm_streams_levelpaths_gpkg=levelpaths_gpkg, + aoi_id=huc8, + out_dir=watershed, + aoi_filter_column="aoi_id", + branch_zero_id="0", + ) + if result is None: + raise RuntimeError("No gages could be assigned to branches") + log.info(" C21 PASS: %d gages assigned", len(result.aoi_gages)) + else: + log.info(" C21 SKIP (exists): usgs_subset_gages.gpkg") + + # ── C22: Per-branch crosswalk ──────────────────────────────────── + aoi_gages_gdf = gpd.read_file(out_aoi) + gauged_bids = set(aoi_gages_gdf["levpa_id"].dropna().astype(str).unique()) | {"0"} + log.info(" C22: branches with gauges = %s", sorted(gauged_bids)) + + shared_dem = watershed / "dem.tif" + if not shared_dem.exists(): + raise FileNotFoundError(f"Shared DEM not found: {shared_dem}") + + for branch_dir in sorted(d for d in branches.iterdir() if d.is_dir()): + bid = branch_dir.name + if bid not in gauged_bids: + continue + out_table = branch_dir / "usgs_elev_table.csv" + if out_table.exists(): + log.info(" C22 SKIP branch %s (exists)", bid) + continue + gages_gpkg = out_bzero if bid == "0" else out_aoi + catchments_gpkg = branch_dir / ( + f"gw_catchments_reaches_filtered_addedAttributes_crosswalked_{bid}.gpkg" + ) + flows_gpkg = branch_dir / ( + f"demDerived_reaches_split_filtered_addedAttributes_crosswalked_{bid}.gpkg" + ) + missing = [p for p in (catchments_gpkg, flows_gpkg) if not p.exists()] + if missing: + log.warning(" C22 SKIP branch %s — missing: %s", bid, + ", ".join(p.name for p in missing)) + continue + branch_dem = branch_dir / f"dem_{bid}.tif" + dem_path = branch_dem if branch_dem.exists() else shared_dem + log.info(" C22: crosswalk branch %s (dem=%s) …", bid, dem_path.name) + try: + out = run_branch_crosswalk( + aoi_gages_gpkg=gages_gpkg, + branch_catchments_gpkg=catchments_gpkg, + branch_flows_gpkg=flows_gpkg, + dem_path=dem_path, + dem_thalweg_path=dem_path, + branch_id=bid, + out_dir=branch_dir, + ) + written = [p for p in out.values() if p is not None] + log.info(" branch %s OK: %s", bid, + ", ".join(p.name for p in written) if written + else "no gages intersected catchments") + except Exception: + log.error(" branch %s FAIL:\n%s", bid, traceback.format_exc()) + + # ── CON: Consolidate ───────────────────────────────────────────── + frames = [] + for branch_dir in sorted(d for d in branches.iterdir() if d.is_dir()): + t = branch_dir / "usgs_elev_table.csv" + if t.exists(): + df = pd.read_csv(t, dtype={"location_id": str}) + df["levpa_id"] = branch_dir.name + frames.append(df) + if frames: + out_path = aoi_root / "usgs_elev_table.csv" + consolidated = pd.concat(frames, ignore_index=True).drop_duplicates( + subset=["location_id", "HydroID"] + ) + consolidated.to_csv(out_path, index=False) + log.info(" CON PASS: %d gage rows → usgs_elev_table.csv", len(consolidated)) + else: + log.warning(" CON: no per-branch usgs_elev_table.csv found — no gauges in HUC?") + + +def main(): + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + done = _load_done() + remaining = [h for h in hucs if h not in done] + + log.info(f"Crosswalk: {len(hucs)} total | {len(done)} already done | {len(remaining)} to run") + log.info(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + batch_start = time.time() + huc_times: list[tuple[str, float, str]] = [] + passed, failed = list(done), [] + + for i, huc8 in enumerate(remaining, 1): + huc_start = time.time() + log.info(f" [{i}/{len(remaining)}] HUC8 = {huc8} | " + f"batch elapsed: {_fmt(huc_start - batch_start)}") + try: + run_huc(huc8) + elapsed = time.time() - huc_start + _log_result(huc8, "PASS") + passed.append(huc8) + huc_times.append((huc8, elapsed, "PASS")) + completed = [t for _, t, s in huc_times if s == "PASS"] + avg_s = sum(completed) / len(completed) + remaining_n = len(remaining) - i + eta_s = avg_s * remaining_n + log.info(f" [{huc8}] PASS | this HUC: {_fmt(elapsed)} | " + f"avg: {_fmt(avg_s)} | remaining: {remaining_n} | " + f"ETA: {_fmt(eta_s)}" + + (f" (~{datetime.fromtimestamp(time.time() + eta_s).strftime('%H:%M')})" + if remaining_n > 0 else " (last HUC)")) + except Exception: + elapsed = time.time() - huc_start + err = traceback.format_exc().splitlines()[-1] + _log_result(huc8, "FAIL", err) + failed.append(huc8) + huc_times.append((huc8, elapsed, "FAIL")) + log.error(f" [{huc8}] FAIL after {_fmt(elapsed)} | {err}") + + total = time.time() - batch_start + log.info(f"\n{'─'*60}") + log.info(f"Crosswalk complete | total time: {_fmt(total)}") + log.info(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f"Passed: {len(passed)} | Failed: {len(failed)}") + if huc_times: + log.info(f"\nPer-HUC timing summary:") + for huc8, elapsed, status in huc_times: + log.info(f" {huc8} {status:4s} {_fmt(elapsed)}") + if failed: + log.warning(f"\nFailed HUCs: {failed}") + log.warning("Re-run this script to retry (FAIL lines are automatically retried).") + + +if __name__ == "__main__": + main() diff --git a/scripts/s05_calibrate_n_recurrence.py b/scripts/s05_calibrate_n_recurrence.py new file mode 100644 index 0000000..d7a0f7c --- /dev/null +++ b/scripts/s05_calibrate_n_recurrence.py @@ -0,0 +1,460 @@ +""" +Post-Stage-2 Manning's n calibration using USGS rating curves at NWM recurrence stages. + +Concept +------- +For each USGS gauge in the HUC8, the NWM supplies 5 recurrence-interval flows +(2yr, 5yr, 10yr, 25yr, 50yr in cfs). The USGS rating curve maps each of those +flows to an absolute water-surface elevation, which we convert to a HAND stage +using the thalweg DEM elevation sampled at the gauge point. + +Those 5 HAND stages become 6 depth zones: + + Zone 1 : 0 < h <= h_2yr + Zone 2 : h_2yr < h <= h_5yr + Zone 3 : h_5yr < h <= h_10yr + Zone 4 : h_10yr < h <= h_25yr + Zone 5 : h_25yr < h <= h_50yr + Zone 6 : h > h_50yr (same n as Zone 5) + +For each zone boundary h_r, Manning's equation is inverted against the SRC +hydraulic geometry at that stage to back-calculate n: + + n_r = A(h_r) . R(h_r)^(2/3) . sqrt(S) / Q_r + +where Q_r is the NWM recurrence flow and A, R, S come from the SRC table. + +The calibrated n values are then applied piecewise to recompute discharge at +every stage in the SRC, replacing the original constant n=0.06 baseline. + +Scope +----- +Only "directly calibrated" branches are processed: + - Non-trunk branches (levpa_id != "0") + - That have at least one USGS gauge with available rating-curve data + +Branch 0 (the watershed trunk) and all ungauged branches are intentionally +left unchanged. Ungauged branches will be handled by a separate ML-based +regionalisation step. + +Prerequisite +------------ +run_usgs_gage_crosswalk.py must have run and produced + D:/SI/out/HUC03020102/usgs_elev_table.csv + +Run +--- + .venv\\Scripts\\python.exe scripts/calibrate_n_recurrence.py +""" +from __future__ import annotations + +import logging +import shutil +from pathlib import Path + +import numpy as np +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +HUC8 = "03020102" +OUT_DIR = Path("D:/SI/out") +DATA_DIR = Path("data") + +# Physical bounds for back-calculated n (Chow 1959 range). +N_MIN = 0.010 # very smooth concrete +N_MAX = 0.300 # dense brush / floodplain forest +# ───────────────────────────────────────────────────────────────── + +RECURRENCE_YEARS = [2, 5, 10, 25, 50] +RECUR_COLS = {yr: f"{yr}_0_year_recurrence_flow_17C" for yr in RECURRENCE_YEARS} + +AOI_ROOT = OUT_DIR / f"HUC{HUC8}" +WATERSHED = AOI_ROOT / "watershed-data" +BRANCHES = WATERSHED / "branches" + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +# ── Data loading ────────────────────────────────────────────────── + +def load_inputs() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Load the three external tables needed for calibration.""" + elev_path = AOI_ROOT / "usgs_elev_table.csv" + if not elev_path.exists(): + raise FileNotFoundError( + f"usgs_elev_table.csv not found at {elev_path}\n" + "Run scripts/run_usgs_gage_crosswalk.py first." + ) + + elev = pd.read_csv(elev_path, dtype={"location_id": str, "feature_id": "Int64"}) + log.info("usgs_elev_table: %d rows", len(elev)) + + rc = pd.read_parquet(DATA_DIR / "usgs_rating_curves.parquet") + rc["location_id"] = rc["location_id"].astype(str) + rc["flow_cms"] = rc["flow"] / 35.3147 # cfs → m³/s + rc["elev_m"] = rc["elevation_navd88"] / 3.28084 # ft NAVD88 → m + log.info("USGS rating curves: %d rows, %d gauges", + len(rc), rc["location_id"].nunique()) + + recur = pd.read_parquet(DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet") + for yr, col in RECUR_COLS.items(): + recur[f"Q_{yr}yr_cms"] = recur[col].astype(float) * 0.028317 # cfs → m³/s + recur["feature_id"] = recur["feature_id"].astype("Int64") + log.info("NWM recurrence flows: %d feature_ids", len(recur)) + + return elev, rc, recur + + +def load_src(branch_dir: Path, bid: str) -> pd.DataFrame | None: + """Load src_full_crosswalked CSV for a branch; return None if missing.""" + p = branch_dir / f"src_full_crosswalked_{bid}.csv" + if not p.exists(): + return None + df = pd.read_csv(p, dtype={"feature_id": str}) + df["HydroID"] = df["HydroID"].astype(int) + return df + + +# ── Zone boundary calculation ───────────────────────────────────── + +def compute_zone_boundaries( + location_id: str, + feature_id: int, + dem_adj_elevation_m: float, + rc: pd.DataFrame, + recur: pd.DataFrame, +) -> dict[int, tuple[float, float]] | None: + """ + Returns {recurrence_year: (hand_stage_m, target_Q_cms)} for each year. + Returns None when the gauge or feature_id has insufficient data. + """ + # Rating curve file uses zero-padded 8-digit IDs (e.g. "02082950"). + loc_padded = location_id.zfill(8) + gage_rc = rc[rc["location_id"] == loc_padded].copy() + if gage_rc.empty: + return None + gage_rc = gage_rc.sort_values("flow_cms").drop_duplicates("flow_cms") + + feat_recur = recur[recur["feature_id"] == feature_id] + if feat_recur.empty: + return None + + boundaries = {} + for yr in RECURRENCE_YEARS: + Q_r = float(feat_recur[f"Q_{yr}yr_cms"].iloc[0]) + if Q_r <= 0: + continue + + # Interpolate the USGS rating curve to find the absolute elevation at Q_r. + if Q_r < gage_rc["flow_cms"].min() or Q_r > gage_rc["flow_cms"].max(): + continue # outside observed range — skip this recurrence + elev_r = float(np.interp(Q_r, gage_rc["flow_cms"], gage_rc["elev_m"])) + hand_r = elev_r - dem_adj_elevation_m + + if hand_r <= 0: + log.debug(" gage %s yr=%d: HAND<0 (%.2fm) — skipping", location_id, yr, hand_r) + continue + + boundaries[yr] = (hand_r, Q_r) + + return boundaries if boundaries else None + + +# ── Back-calculate n per zone ───────────────────────────────────── + +def calibrate_n_zones( + hydroid: int, + src: pd.DataFrame, + boundaries: dict[int, tuple[float, float]], +) -> dict[int, float]: + """ + Back-calculate Manning's n for each zone boundary. + + At each recurrence stage h_r, solve: + n_r = A(h_r) · R(h_r)^(2/3) · √S / Q_r + + Returns {recurrence_year: n_value} for successfully calibrated zones. + """ + hdf = src[src["HydroID"] == hydroid].sort_values("Stage") + if hdf.empty: + return {} + + # Use the subdivided discharge column if subdiv already ran; else Manning raw. + slope = hdf["SLOPE"].iloc[0] + if slope <= 0: + return {} + + n_zones: dict[int, float] = {} + for yr, (h_r, Q_r) in sorted(boundaries.items()): + # Find the SRC row whose stage is closest to h_r. + idx = (hdf["Stage"] - h_r).abs().idxmin() + row = hdf.loc[idx] + + A = float(row.get("WetArea (m2)", 0.0)) + R = float(row.get("HydraulicRadius (m)", 0.0)) + + if A <= 0 or R <= 0 or Q_r <= 0: + continue + + n_r = A * (R ** (2.0 / 3.0)) * (slope ** 0.5) / Q_r + n_r = float(np.clip(n_r, N_MIN, N_MAX)) + n_zones[yr] = n_r + log.debug(" HydroID %d yr=%d h=%.2fm Q=%.1f cms n=%.4f", + hydroid, yr, h_r, Q_r, n_r) + + return n_zones + + +# ── Apply piecewise n to a full SRC ────────────────────────────── + +def apply_n_zones_to_src( + src: pd.DataFrame, + hydroid: int, + boundaries: dict[int, tuple[float, float]], + n_zones: dict[int, float], +) -> pd.DataFrame: + """ + Recompute discharge for one HydroID using piecewise n. + + Zone lookup (sorted ascending by h_r): + Stage <= h_2yr → n_2yr + h_2yr < stage <= h_5yr → n_5yr + ... + stage > h_50yr → n of the highest available zone + """ + hdf = src[src["HydroID"] == hydroid].copy() + if hdf.empty or not n_zones: + return src + + slope = hdf["SLOPE"].iloc[0] + # Sorted zone boundaries: [(h_2yr, n_2yr), (h_5yr, n_5yr), ...] + sorted_zones = sorted( + ((h, n_zones[yr]) for yr, (h, _) in boundaries.items() if yr in n_zones), + key=lambda x: x[0], + ) + if not sorted_zones: + return src + + def _zone_n(stage_h: float) -> float: + for h_boundary, n_val in sorted_zones: + if stage_h <= h_boundary: + return n_val + return sorted_zones[-1][1] # above all boundaries → highest zone n + + rows_mask = src["HydroID"] == hydroid + + new_q = [] + for _, row in hdf.iterrows(): + h = float(row["Stage"]) + if h == 0.0: + new_q.append(0.0) + continue + A = float(row.get("WetArea (m2)", 0.0)) + R = float(row.get("HydraulicRadius (m)", 0.0)) + n = _zone_n(h) + if A <= 0 or R <= 0 or slope <= 0: + new_q.append(float(row.get("Discharge (m3s-1)", 0.0))) + else: + new_q.append(A * (R ** (2.0 / 3.0)) * (slope ** 0.5) / n) + + src.loc[rows_mask, "Discharge (m3s-1)"] = new_q + # Record the zone-n for the lowest recurrence year on each row (informational). + src.loc[rows_mask, "zonal_n_applied"] = [_zone_n(float(h)) + for h in hdf["Stage"]] + return src + + +# ── Propagation helpers ─────────────────────────────────────────── + +def average_zones( + all_boundaries: list[dict[int, tuple[float, float]]], + all_n_zones: list[dict[int, float]], +) -> tuple[dict[int, tuple[float, float]], dict[int, float]]: + """Average calibrated zone data across multiple gauges in a branch.""" + # Average the HAND boundary stages and n values independently. + avg_boundaries: dict[int, list] = {} + avg_n: dict[int, list] = {} + for bounds, n_zones in zip(all_boundaries, all_n_zones): + for yr, (h, _) in bounds.items(): + avg_boundaries.setdefault(yr, []).append(h) + for yr, n in n_zones.items(): + avg_n.setdefault(yr, []).append(n) + + merged_bounds = {yr: (float(np.mean(hs)), 0.0) for yr, hs in avg_boundaries.items()} + merged_n = {yr: float(np.mean(ns)) for yr, ns in avg_n.items()} + return merged_bounds, merged_n + + +# ── Per-branch processing ───────────────────────────────────────── + +def process_branch( + bid: str, + branch_dir: Path, + elev: pd.DataFrame, + rc: pd.DataFrame, + recur: pd.DataFrame, +) -> None: + """ + Calibrate n for a directly calibrated branch and write back to the SRC + + hydroTable CSVs. + + The gauge's calibrated n is applied to every HydroID in the branch. + If the branch somehow has multiple gauges with RC data, their zone n values + are averaged before propagation. + """ + src = load_src(branch_dir, bid) + if src is None: + log.warning("Branch %s: src_full_crosswalked not found — skipping", bid) + return + + ht_path = branch_dir / f"hydroTable_{bid}.csv" + if not ht_path.exists(): + log.warning("Branch %s: hydroTable not found — skipping", bid) + return + + branch_elev = elev[elev["levpa_id"].astype(str) == str(bid)] + log.info("Branch %s: %d gauge row(s)", bid, len(branch_elev)) + + all_bounds_list = [] + all_n_list = [] + + for _, gage in branch_elev.iterrows(): + loc_id = str(gage["location_id"]) + hydroid = int(gage["HydroID"]) + feat_id = int(gage["feature_id"]) if pd.notna(gage["feature_id"]) else -1 + dem_adj = float(gage["dem_adj_elevation"]) + + if feat_id < 0: + log.warning(" Gauge %s: no feature_id — skipping", loc_id) + continue + + bounds = compute_zone_boundaries(loc_id, feat_id, dem_adj, rc, recur) + if bounds is None: + log.warning(" Gauge %s: could not compute zone boundaries — skipping", loc_id) + continue + + n_zones = calibrate_n_zones(hydroid, src, bounds) + if not n_zones: + log.warning(" Gauge %s HydroID %d: no valid n values — skipping", loc_id, hydroid) + continue + + log.info(" Gauge %s HydroID %d: calibrated n = %s", + loc_id, hydroid, + {yr: f"{n:.4f}" for yr, n in sorted(n_zones.items())}) + + src = apply_n_zones_to_src(src, hydroid, bounds, n_zones) + all_bounds_list.append(bounds) + all_n_list.append(n_zones) + + if not all_bounds_list: + log.warning("Branch %s: no valid gauge calibrations — SRC not modified", bid) + return + + # Single gauge: use directly. Multiple gauges: average before propagation. + if len(all_bounds_list) == 1: + branch_bounds, branch_n = all_bounds_list[0], all_n_list[0] + else: + branch_bounds, branch_n = average_zones(all_bounds_list, all_n_list) + + # Apply branch n to all HydroIDs not already individually calibrated. + calibrated_hids = { + int(gage["HydroID"]) + for _, gage in branch_elev.iterrows() + if pd.notna(gage["feature_id"]) + } + for hid in src["HydroID"].unique(): + if hid not in calibrated_hids: + src = apply_n_zones_to_src(src, hid, branch_bounds, branch_n) + + src_path = branch_dir / f"src_full_crosswalked_{bid}.csv" + _backup(src_path) + _safe_write_csv(src, src_path, bid) + _sync_hydrotable(ht_path, src, bid) + + log.info("Branch %s: updated %d HydroIDs", bid, src["HydroID"].nunique()) + + +def _backup(path: Path) -> None: + """Copy to *.pre_n_calib.csv if backup doesn't already exist.""" + backup = path.with_suffix(".pre_n_calib.csv") + if not backup.exists(): + shutil.copy2(path, backup) + + +def _safe_write_csv(df: pd.DataFrame, path: Path, bid: str) -> None: + """Write CSV via a temp file to avoid Windows file-lock errors.""" + import os + tmp = path.with_suffix(".tmp.csv") + try: + df.to_csv(tmp, index=False) + os.replace(tmp, path) # atomic on same filesystem + except PermissionError as exc: + tmp.unlink(missing_ok=True) + raise PermissionError( + f"Branch {bid}: cannot write {path.name} — close it in any open " + f"application and re-run.\n ({exc})" + ) from exc + + +def _sync_hydrotable(ht_path: Path, src: pd.DataFrame, bid: str) -> None: + """Push the updated discharge + zonal_n_applied from SRC into the hydroTable.""" + _backup(ht_path) + ht = pd.read_csv(ht_path, low_memory=False) + + pull = src[["HydroID", "Stage", "Discharge (m3s-1)"]].copy() + if "zonal_n_applied" in src.columns: + pull["zonal_n_applied"] = src["zonal_n_applied"] + pull = pull.rename(columns={"Stage": "stage", "Discharge (m3s-1)": "discharge_cms"}) + + ht["HydroID"] = ht["HydroID"].astype(int) + ht = ht.drop(columns=["discharge_cms", "zonal_n_applied"], errors="ignore") + ht = ht.merge(pull, on=["HydroID", "stage"], how="left") + ht["discharge_cms"] = ht["discharge_cms"].fillna(0.0) + ht.to_csv(ht_path, index=False) + + +# ── Main ────────────────────────────────────────────────────────── + +def _get_directly_calibrated_branches(elev: pd.DataFrame, rc: pd.DataFrame) -> set[str]: + """ + Returns branch IDs that qualify for direct calibration: + - levpa_id != "0" (not the watershed trunk) + - gauge has at least one entry in the USGS rating-curve parquet + """ + rc_ids = set(rc["location_id"].astype(str).unique()) + non_trunk = elev[elev["levpa_id"].astype(str) != "0"].copy() + non_trunk["has_rc"] = non_trunk["location_id"].apply( + lambda loc: str(loc).zfill(8) in rc_ids + ) + return set(non_trunk[non_trunk["has_rc"]]["levpa_id"].astype(str).unique()) + + +def main(): + log.info("=== n-Recurrence Calibration — HUC8 %s ===", HUC8) + + elev, rc, recur = load_inputs() + + directly_calibrated = _get_directly_calibrated_branches(elev, rc) + if not directly_calibrated: + log.warning("No directly calibrated branches found — nothing to do.") + return + log.info("Directly calibrated branches: %s", sorted(directly_calibrated)) + + branch_dirs = sorted(d for d in BRANCHES.iterdir() if d.is_dir()) + for bdir in branch_dirs: + bid = bdir.name + if bid not in directly_calibrated: + log.debug("Branch %s: not directly calibrated — skip", bid) + continue + process_branch(bid, bdir, elev, rc, recur) + + log.info("=== n-Recurrence Calibration complete ===") + log.info("Calibrated branches: %s", sorted(directly_calibrated)) + log.info("Original SRCs backed up as *.pre_n_calib.csv") + + +if __name__ == "__main__": + main() diff --git a/scripts/s05_calibrate_n_recurrence_all_hucs.py b/scripts/s05_calibrate_n_recurrence_all_hucs.py new file mode 100644 index 0000000..625b647 --- /dev/null +++ b/scripts/s05_calibrate_n_recurrence_all_hucs.py @@ -0,0 +1,432 @@ +""" +Manning's n calibration for all HUC8s using USGS rating curves at NWM recurrence stages. + +Concept +------- +For each gauged branch, NWM recurrence flows (2, 5, 10, 25, 50 yr) are interpolated +through the USGS rating curve to HAND stage. Those 5 stage boundaries define 6 depth +zones. Manning's n is back-calculated at each boundary: + + n_r = A(h_r) · R(h_r)^(2/3) · √S / Q_r + +The calibrated n is then applied piecewise, recomputing discharge at every SRC stage row. +All HydroIDs in a calibrated branch receive the same piecewise n structure. + +Scope +----- +Only non-trunk branches that have at least one USGS gauge with rating-curve data are +processed. Branch 0 and ungauged branches are left unchanged. + +Prerequisite: s04_usgs_gage_crosswalk_all_hucs.py must have completed for each HUC +(usgs_elev_table.csv must exist at the AOI root). + +Run: + .venv\\Scripts\\python.exe scripts/s05_calibrate_n_recurrence_all_hucs.py +""" +from __future__ import annotations + +import logging +import os +import shutil +import time +import traceback +from datetime import datetime +from pathlib import Path + +import numpy as np +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") +DATA_DIR = Path("data") +TASK_LOG = Path("E:/SI/out/ncalib_status.txt") + +N_MIN = 0.010 +N_MAX = 0.300 +RECURRENCE_YEARS = [2, 5, 10, 25, 50] +RECUR_COLS = {yr: f"{yr}_0_year_recurrence_flow_17C" for yr in RECURRENCE_YEARS} +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +# ── Progress tracking ───────────────────────────────────────────── + +def _load_done() -> set[str]: + if not TASK_LOG.exists(): + return set() + done = set() + for line in TASK_LOG.read_text().splitlines(): + parts = line.strip().split() + if len(parts) >= 3 and parts[2] == "PASS": + done.add(parts[1]) + return done + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"ncalib {huc8} {status}{(' ' + note) if note else ''}\n") + + +def _fmt(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h}h {m:02d}m {s:02d}s" + if m: + return f"{m}m {s:02d}s" + return f"{s}s" + + +# ── Zone boundary calculation ───────────────────────────────────── + +def compute_zone_boundaries( + location_id: str, + feature_id: int, + dem_adj_m: float, + rc: pd.DataFrame, + recur: pd.DataFrame, +) -> dict[int, tuple[float, float]] | None: + loc_padded = str(location_id).zfill(8) + gage_rc = rc[rc["location_id"] == loc_padded].copy() + if gage_rc.empty: + return None + gage_rc = gage_rc.sort_values("flow_cms").drop_duplicates("flow_cms") + + feat_recur = recur[recur["feature_id"] == feature_id] + if feat_recur.empty: + return None + + boundaries: dict[int, tuple[float, float]] = {} + for yr in RECURRENCE_YEARS: + Q_r = float(feat_recur[f"Q_{yr}yr_cms"].iloc[0]) + if Q_r <= 0: + continue + if Q_r < gage_rc["flow_cms"].min() or Q_r > gage_rc["flow_cms"].max(): + continue + elev_r = float(np.interp(Q_r, gage_rc["flow_cms"].values, gage_rc["elev_m"].values)) + hand_r = elev_r - dem_adj_m + if hand_r <= 0: + log.debug(" gage %s yr=%d: HAND<0 (%.2fm) — skipping", location_id, yr, hand_r) + continue + boundaries[yr] = (hand_r, Q_r) + + return boundaries if boundaries else None + + +# ── Back-calculate n per zone ───────────────────────────────────── + +def calibrate_n_zones( + hydroid: int, + src: pd.DataFrame, + boundaries: dict[int, tuple[float, float]], +) -> dict[int, float]: + hdf = src[src["HydroID"] == hydroid].sort_values("Stage") + if hdf.empty: + return {} + slope = float(hdf["SLOPE"].iloc[0]) + if slope <= 0: + return {} + + n_zones: dict[int, float] = {} + for yr, (h_r, Q_r) in sorted(boundaries.items()): + idx = (hdf["Stage"] - h_r).abs().idxmin() + row = hdf.loc[idx] + A = float(row.get("WetArea (m2)", 0.0)) + R = float(row.get("HydraulicRadius (m)", 0.0)) + if A <= 0 or R <= 0 or Q_r <= 0: + continue + n_r = A * (R ** (2.0 / 3.0)) * (slope ** 0.5) / Q_r + n_zones[yr] = float(np.clip(n_r, N_MIN, N_MAX)) + log.debug(" HydroID %d yr=%d h=%.2fm Q=%.1f cms n=%.4f", + hydroid, yr, h_r, Q_r, n_zones[yr]) + + return n_zones + + +# ── Apply piecewise n to a full SRC ────────────────────────────── + +def apply_n_zones_to_src( + src: pd.DataFrame, + hydroid: int, + boundaries: dict[int, tuple[float, float]], + n_zones: dict[int, float], +) -> pd.DataFrame: + hdf = src[src["HydroID"] == hydroid].copy() + if hdf.empty or not n_zones: + return src + slope = float(hdf["SLOPE"].iloc[0]) + + sorted_zones = sorted( + ((h, n_zones[yr]) for yr, (h, _) in boundaries.items() if yr in n_zones), + key=lambda x: x[0], + ) + if not sorted_zones: + return src + + def _zone_n(stage_h: float) -> float: + for h_boundary, n_val in sorted_zones: + if stage_h <= h_boundary: + return n_val + return sorted_zones[-1][1] + + rows_mask = src["HydroID"] == hydroid + new_q = [] + for _, row in hdf.iterrows(): + h = float(row["Stage"]) + if h == 0.0: + new_q.append(0.0) + continue + A = float(row.get("WetArea (m2)", 0.0)) + R = float(row.get("HydraulicRadius (m)", 0.0)) + n = _zone_n(h) + if A <= 0 or R <= 0 or slope <= 0: + new_q.append(float(row.get("Discharge (m3s-1)", 0.0))) + else: + new_q.append(A * (R ** (2.0 / 3.0)) * (slope ** 0.5) / n) + + src.loc[rows_mask, "Discharge (m3s-1)"] = new_q + src.loc[rows_mask, "zonal_n_applied"] = [_zone_n(float(h)) for h in hdf["Stage"]] + return src + + +# ── Multi-gauge averaging ───────────────────────────────────────── + +def average_zones( + all_boundaries: list[dict[int, tuple[float, float]]], + all_n_zones: list[dict[int, float]], +) -> tuple[dict[int, tuple[float, float]], dict[int, float]]: + avg_boundaries: dict[int, list] = {} + avg_n: dict[int, list] = {} + for bounds, n_zones in zip(all_boundaries, all_n_zones): + for yr, (h, _) in bounds.items(): + avg_boundaries.setdefault(yr, []).append(h) + for yr, n in n_zones.items(): + avg_n.setdefault(yr, []).append(n) + merged_bounds = {yr: (float(np.mean(hs)), 0.0) for yr, hs in avg_boundaries.items()} + merged_n = {yr: float(np.mean(ns)) for yr, ns in avg_n.items()} + return merged_bounds, merged_n + + +# ── File I/O helpers ────────────────────────────────────────────── + +def _backup(path: Path) -> None: + backup = path.with_suffix(".pre_n_calib.csv") + if not backup.exists(): + shutil.copy2(path, backup) + + +def _safe_write_csv(df: pd.DataFrame, path: Path, bid: str) -> None: + tmp = path.with_suffix(".tmp.csv") + try: + df.to_csv(tmp, index=False) + os.replace(tmp, path) + except PermissionError as exc: + tmp.unlink(missing_ok=True) + raise PermissionError( + f"Branch {bid}: cannot write {path.name} — close it in any open application." + ) from exc + + +def _sync_hydrotable(ht_path: Path, src: pd.DataFrame, bid: str) -> None: + _backup(ht_path) + ht = pd.read_csv(ht_path, low_memory=False) + pull = src[["HydroID", "Stage", "Discharge (m3s-1)"]].copy() + if "zonal_n_applied" in src.columns: + pull["zonal_n_applied"] = src["zonal_n_applied"] + pull = pull.rename(columns={"Stage": "stage", "Discharge (m3s-1)": "discharge_cms"}) + ht["HydroID"] = ht["HydroID"].astype(int) + ht = ht.drop(columns=["discharge_cms", "zonal_n_applied"], errors="ignore") + ht = ht.merge(pull, on=["HydroID", "stage"], how="left") + ht["discharge_cms"] = ht["discharge_cms"].fillna(0.0) + ht.to_csv(ht_path, index=False) + + +# ── Per-branch processing ───────────────────────────────────────── + +def process_branch( + bid: str, + branch_dir: Path, + elev: pd.DataFrame, + rc: pd.DataFrame, + recur: pd.DataFrame, +) -> None: + src_path = branch_dir / f"src_full_crosswalked_{bid}.csv" + ht_path = branch_dir / f"hydroTable_{bid}.csv" + + if not src_path.exists(): + log.warning(" Branch %s: SRC not found — skip", bid) + return + if not ht_path.exists(): + log.warning(" Branch %s: hydroTable not found — skip", bid) + return + + src = pd.read_csv(src_path, dtype={"feature_id": str}) + src["HydroID"] = src["HydroID"].astype(int) + + branch_elev = elev[elev["levpa_id"].astype(str) == str(bid)] + log.info(" Branch %s: %d gauge row(s)", bid, len(branch_elev)) + + all_bounds_list: list[dict] = [] + all_n_list: list[dict] = [] + calibrated_hids: set[int] = set() + + for _, gage in branch_elev.iterrows(): + loc_id = str(gage["location_id"]) + hydroid = int(gage["HydroID"]) + feat_id = int(gage["feature_id"]) if pd.notna(gage["feature_id"]) else -1 + dem_adj = float(gage["dem_adj_elevation"]) + + if feat_id < 0: + log.warning(" Gauge %s: no feature_id — skip", loc_id) + continue + + bounds = compute_zone_boundaries(loc_id, feat_id, dem_adj, rc, recur) + if bounds is None: + log.warning(" Gauge %s: no zone boundaries — skip", loc_id) + continue + + n_zones = calibrate_n_zones(hydroid, src, bounds) + if not n_zones: + log.warning(" Gauge %s HydroID %d: no valid n — skip", loc_id, hydroid) + continue + + log.info(" Gauge %s HydroID %d: n = %s", loc_id, hydroid, + {yr: f"{n:.4f}" for yr, n in sorted(n_zones.items())}) + + src = apply_n_zones_to_src(src, hydroid, bounds, n_zones) + all_bounds_list.append(bounds) + all_n_list.append(n_zones) + calibrated_hids.add(hydroid) + + if not all_bounds_list: + log.warning(" Branch %s: no valid gauge calibrations — SRC not modified", bid) + return + + branch_bounds, branch_n = ( + (all_bounds_list[0], all_n_list[0]) + if len(all_bounds_list) == 1 + else average_zones(all_bounds_list, all_n_list) + ) + + for hid in src["HydroID"].unique(): + if hid not in calibrated_hids: + src = apply_n_zones_to_src(src, hid, branch_bounds, branch_n) + + _backup(src_path) + _safe_write_csv(src, src_path, bid) + _sync_hydrotable(ht_path, src, bid) + log.info(" Branch %s: updated %d HydroIDs", bid, src["HydroID"].nunique()) + + +# ── Per-HUC entry point ─────────────────────────────────────────── + +def run_huc(huc8: str, rc: pd.DataFrame, recur: pd.DataFrame) -> None: + aoi_root = OUT_DIR / f"HUC{huc8}" + branches = aoi_root / "watershed-data" / "branches" + + elev_path = aoi_root / "usgs_elev_table.csv" + if not elev_path.exists(): + raise FileNotFoundError( + f"usgs_elev_table.csv not found — run s04 crosswalk first: {elev_path}" + ) + + elev = pd.read_csv(elev_path, dtype={"location_id": str, "feature_id": "Int64"}) + log.info(" usgs_elev_table: %d rows", len(elev)) + + rc_ids = set(rc["location_id"].astype(str)) + non_trunk = elev[elev["levpa_id"].astype(str) != "0"].copy() + non_trunk["has_rc"] = non_trunk["location_id"].apply( + lambda loc: str(loc).zfill(8) in rc_ids + ) + directly_calibrated = set(non_trunk[non_trunk["has_rc"]]["levpa_id"].astype(str)) + + if not directly_calibrated: + log.info(" No directly calibrated branches for HUC %s — nothing to do", huc8) + return + log.info(" Calibrated branches: %s", sorted(directly_calibrated)) + + for branch_dir in sorted(d for d in branches.iterdir() if d.is_dir()): + bid = branch_dir.name + if bid in directly_calibrated: + process_branch(bid, branch_dir, elev, rc, recur) + + +# ── Main ────────────────────────────────────────────────────────── + +def main(): + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + done = _load_done() + remaining = [h for h in hucs if h not in done] + + log.info(f"n-Calibration: {len(hucs)} total | {len(done)} already done | {len(remaining)} to run") + log.info(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Load shared tables once — identical for all HUCs + rc = pd.read_parquet(DATA_DIR / "usgs_rating_curves.parquet") + rc["location_id"] = rc["location_id"].astype(str) + rc["flow_cms"] = rc["flow"].astype(float) / 35.3147 + rc["elev_m"] = rc["elevation_navd88"].astype(float) / 3.28084 + log.info(f"USGS rating curves: {len(rc)} rows, {rc['location_id'].nunique()} gauges") + + recur = pd.read_parquet(DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet") + for yr, col in RECUR_COLS.items(): + recur[f"Q_{yr}yr_cms"] = recur[col].astype(float) * 0.028317 + recur["feature_id"] = recur["feature_id"].astype("Int64") + log.info(f"NWM recurrence flows: {len(recur)} feature_ids") + + batch_start = time.time() + huc_times: list[tuple[str, float, str]] = [] + passed, failed = list(done), [] + + for i, huc8 in enumerate(remaining, 1): + huc_start = time.time() + log.info(f" [{i}/{len(remaining)}] HUC8 = {huc8} | " + f"batch elapsed: {_fmt(huc_start - batch_start)}") + try: + run_huc(huc8, rc, recur) + elapsed = time.time() - huc_start + _log_result(huc8, "PASS") + passed.append(huc8) + huc_times.append((huc8, elapsed, "PASS")) + completed = [t for _, t, s in huc_times if s == "PASS"] + avg_s = sum(completed) / len(completed) + remaining_n = len(remaining) - i + eta_s = avg_s * remaining_n + log.info(f" [{huc8}] PASS | this HUC: {_fmt(elapsed)} | " + f"avg: {_fmt(avg_s)} | remaining: {remaining_n} | " + f"ETA: {_fmt(eta_s)}" + + (f" (~{datetime.fromtimestamp(time.time() + eta_s).strftime('%H:%M')})" + if remaining_n > 0 else " (last HUC)")) + except Exception: + elapsed = time.time() - huc_start + err = traceback.format_exc().splitlines()[-1] + _log_result(huc8, "FAIL", err) + failed.append(huc8) + huc_times.append((huc8, elapsed, "FAIL")) + log.error(f" [{huc8}] FAIL after {_fmt(elapsed)} | {err}") + + total = time.time() - batch_start + log.info(f"\n{'─'*60}") + log.info(f"n-Calibration complete | total time: {_fmt(total)}") + log.info(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f"Passed: {len(passed)} | Failed: {len(failed)}") + if huc_times: + log.info(f"\nPer-HUC timing summary:") + for huc8, elapsed, status in huc_times: + log.info(f" {huc8} {status:4s} {_fmt(elapsed)}") + if failed: + log.warning(f"\nFailed HUCs: {failed}") + log.warning("Re-run this script to retry (FAIL lines are automatically retried).") + + +if __name__ == "__main__": + main() diff --git a/scripts/s05_revert_uncalibrated_srcs.py b/scripts/s05_revert_uncalibrated_srcs.py new file mode 100644 index 0000000..c7f8eb3 --- /dev/null +++ b/scripts/s05_revert_uncalibrated_srcs.py @@ -0,0 +1,88 @@ +""" +Revert SRCs for branches that were not directly calibrated. + +"Directly calibrated" = a non-trunk branch (levpa_id != "0") that has exactly +one USGS gauge with available rating-curve data. These are the only branches +where n was back-calculated from real observations. + +All other modified branches (Branch 0 averaged, ungauged fallback) are restored +from their .pre_n_calib.csv backup to the canonical SRC CSV, and the backup is +deleted. After this script runs, .pre_n_calib.csv files exist ONLY for the +truly calibrated branches, so find_calibrated_branches() and the plot script +will automatically show only those. + +Run: + .venv\\Scripts\\python.exe scripts/revert_uncalibrated_srcs.py +""" +import shutil +from pathlib import Path + +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +HUC8 = "03020102" +OUT_DIR = Path("D:/SI/out") +DATA_DIR = Path("data") +# ───────────────────────────────────────────────────────────────── + +AOI_ROOT = OUT_DIR / f"HUC{HUC8}" +WATERSHED = AOI_ROOT / "watershed-data" +BRANCHES = WATERSHED / "branches" + + +def get_directly_calibrated_branches() -> set[str]: + """ + Returns branch IDs where n was individually back-calculated from a single + USGS gauge with rating-curve data. Excludes: + - Branch 0 (trunk; received an average of two gauges) + - Any branch whose gauge has no entry in usgs_rating_curves.parquet + """ + elev = pd.read_csv( + AOI_ROOT / "usgs_elev_table.csv", + dtype={"location_id": str, "feature_id": "Int64"}, + ) + rc = pd.read_parquet(DATA_DIR / "usgs_rating_curves.parquet") + rc_ids = set(rc["location_id"].astype(str).unique()) + + non_trunk = elev[elev["levpa_id"].astype(str) != "0"].copy() + non_trunk["has_rc"] = non_trunk["location_id"].apply( + lambda loc: str(loc).zfill(8) in rc_ids + ) + return set(non_trunk[non_trunk["has_rc"]]["levpa_id"].astype(str).unique()) + + +def main(): + directly_calibrated = get_directly_calibrated_branches() + print(f"Directly calibrated branches (keeping): {sorted(directly_calibrated)}") + print() + + reverted, kept, no_backup = [], [], [] + + for branch_dir in sorted(d for d in BRANCHES.iterdir() if d.is_dir()): + bid = branch_dir.name + backup = branch_dir / f"src_full_crosswalked_{bid}.pre_n_calib.csv" + main_csv = branch_dir / f"src_full_crosswalked_{bid}.csv" + + if not backup.exists(): + no_backup.append(bid) + continue + + if bid in directly_calibrated: + kept.append(bid) + print(f" KEEP branch {bid} (directly calibrated, backup preserved)") + continue + + # Restore original SRC from backup, remove backup + shutil.copy2(backup, main_csv) + backup.unlink() + reverted.append(bid) + print(f" REVERT branch {bid} -> original SRC restored, backup removed") + + print() + print(f"Reverted : {len(reverted)} branch(es) -> {reverted}") + print(f"Kept : {len(kept)} branch(es) -> {kept}") + print(f"Untouched: {len(no_backup)} branch(es) (never modified)") + + +if __name__ == "__main__": + main() diff --git a/scripts/s06_stage3_all_hucs.py b/scripts/s06_stage3_all_hucs.py new file mode 100644 index 0000000..8915566 --- /dev/null +++ b/scripts/s06_stage3_all_hucs.py @@ -0,0 +1,140 @@ +""" +Stage 3 — Feature ID extraction + NWM retrospective streamflow for all HUC8s. + +Step 3a: extract_feature_ids → out//feature_id.csv +Step 3b: getNWMretrospective → out//discharge-inputs/.csv + +Prerequisite: run_stage2_hand.py must have completed for a HUC. + +Progress is written to TASK_LOG after each HUC so a re-run skips completed ones. + +Run: + .venv\\Scripts\\python.exe scripts/run_stage3_streamflow.py +""" +import logging +import time +import traceback +from datetime import datetime +from pathlib import Path +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") + +# NWM retrospective event — edit to your flood event of interest +# Use "date" for a single instant, or set START + END for a continuous range. +EVENT_DATE = "2020-10-10 21:00:00" # single event +# START = "2016-10-05" # uncomment for range +# END = "2016-10-20" + +TASK_LOG = Path("E:/SI/out/stage3_status.txt") +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _load_done() -> set[str]: + if not TASK_LOG.exists(): + return set() + done = set() + for line in TASK_LOG.read_text().splitlines(): + parts = line.strip().split() + if len(parts) >= 3 and parts[2] == "PASS": + done.add(parts[1]) + return done + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage3 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def run_huc(huc8: str) -> None: + from fimbox import extract_feature_ids, getNWMretrospective + + aoi_root = OUT_DIR / f"HUC{huc8}" + if not aoi_root.exists(): + raise FileNotFoundError(f"AOI folder not found — run Stage 1 first: {aoi_root}") + + # Step 3a: collect NWM feature IDs from all branch hydroTables + extract_feature_ids(aoi_root) + + # Step 3b: pull NWM v3 retrospective streamflow for the event + getNWMretrospective(aoi_root, date=EVENT_DATE) + # For a date range, replace the line above with: + # getNWMretrospective(aoi_root, start=START, end=END) + + +def _fmt(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h}h {m:02d}m {s:02d}s" + if m: + return f"{m}m {s:02d}s" + return f"{s}s" + + +def main(): + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + done = _load_done() + remaining = [h for h in hucs if h not in done] + + log.info(f"Stage 3: {len(hucs)} total | {len(done)} already done | {len(remaining)} to run") + log.info(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + batch_start = time.time() + huc_times: list[tuple[str, float, str]] = [] + passed, failed = list(done), [] + + for i, huc8 in enumerate(remaining, 1): + huc_start = time.time() + log.info(f" [{i}/{len(remaining)}] HUC8 = {huc8} | " + f"batch elapsed: {_fmt(huc_start - batch_start)}") + try: + run_huc(huc8) + elapsed = time.time() - huc_start + _log_result(huc8, "PASS") + passed.append(huc8) + huc_times.append((huc8, elapsed, "PASS")) + completed = [t for _, t, s in huc_times if s == "PASS"] + avg_s = sum(completed) / len(completed) + remaining_n = len(remaining) - i + eta_s = avg_s * remaining_n + log.info(f" [{huc8}] PASS | this HUC: {_fmt(elapsed)} | " + f"avg: {_fmt(avg_s)} | remaining: {remaining_n} | " + f"ETA: {_fmt(eta_s)}" + + (f" (~{datetime.fromtimestamp(time.time() + eta_s).strftime('%H:%M')})" + if remaining_n > 0 else " (last HUC)")) + except Exception: + elapsed = time.time() - huc_start + err = traceback.format_exc().splitlines()[-1] + _log_result(huc8, "FAIL", err) + failed.append(huc8) + huc_times.append((huc8, elapsed, "FAIL")) + log.error(f" [{huc8}] FAIL after {_fmt(elapsed)} | {err}") + + total = time.time() - batch_start + log.info(f"\n{'─'*60}") + log.info(f"Stage 3 complete | total time: {_fmt(total)}") + log.info(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f"Passed: {len(passed)} | Failed: {len(failed)}") + if huc_times: + log.info(f"\nPer-HUC timing summary:") + for huc8, elapsed, status in huc_times: + log.info(f" {huc8} {status:4s} {_fmt(elapsed)}") + if failed: + log.warning(f"\nFailed HUCs: {failed}") + log.warning("Re-run this script to retry (FAIL lines are automatically retried).") + + +if __name__ == "__main__": + main() diff --git a/scripts/s06_stage3_single_huc.py b/scripts/s06_stage3_single_huc.py new file mode 100644 index 0000000..824f849 --- /dev/null +++ b/scripts/s06_stage3_single_huc.py @@ -0,0 +1,64 @@ +""" +Stage 3 — Feature IDs + Streamflow for a SINGLE HUC8. +Use this to test one HUC before running run_stage3_streamflow.py on all 24. + +Prerequisite: Stage 2 must have completed for this HUC. + +Set HUC8 and EVENT_DATE below and run: + .venv\\Scripts\\python.exe scripts/run_single_huc_stage3.py +""" +import logging +import traceback +from pathlib import Path + +# ── SET THESE ───────────────────────────────────────────────────── +HUC8 = "03020102" +EVENT_DATE = "2020-10-10 21:00:00" # edit to your flood event +# For a date range instead of a single event, uncomment and use: +# START = "2016-10-05" +# END = "2016-10-20" +# ───────────────────────────────────────────────────────────────── + +OUT_DIR = Path("D:/SI/out") +TASK_LOG = Path("D:/SI/out/stage3_status.txt") + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage3 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def main(): + from fimbox import extract_feature_ids, getNWMretrospective + + aoi_root = OUT_DIR / f"HUC{HUC8}" + if not aoi_root.exists(): + log.error(f"AOI folder not found — run Stage 1 first: {aoi_root}") + return + + log.info(f"Stage 3 — HUC8: {HUC8}") + try: + log.info(" Step 3a: extracting feature IDs …") + extract_feature_ids(aoi_root) + + log.info(f" Step 3b: fetching NWM retrospective for {EVENT_DATE} …") + getNWMretrospective(aoi_root, date=EVENT_DATE) + # For a date range replace the line above with: + # getNWMretrospective(aoi_root, start=START, end=END) + + _log_result(HUC8, "PASS") + log.info("PASS") + except Exception: + err = traceback.format_exc() + _log_result(HUC8, "FAIL", err.splitlines()[-1]) + log.error(f"FAIL:\n{err}") + + +if __name__ == "__main__": + main() diff --git a/scripts/s07_stage4_all_hucs.py b/scripts/s07_stage4_all_hucs.py new file mode 100644 index 0000000..66dc81e --- /dev/null +++ b/scripts/s07_stage4_all_hucs.py @@ -0,0 +1,185 @@ +""" +Stage 4 — Synthetic Rating Curve calibration for all HUC8s. + +Applies bankfull identification, channel/overbank subdivision, and USGS +rating-curve calibration to the per-branch hydroTables produced by Stage 2. + +Prerequisite: run_stage2_hand.py must have completed for a HUC. + +Progress is written to TASK_LOG after each HUC so a re-run skips completed ones. + +Run: + .venv\\Scripts\\python.exe scripts/run_stage4_calibration.py +""" +import logging +import time +import traceback +from datetime import datetime +from pathlib import Path +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") +DATA_DIR = Path("data") # calibration tables shipped in the repo +TASK_LOG = Path("E:/SI/out/stage4_status.txt") +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _load_done() -> set[str]: + if not TASK_LOG.exists(): + return set() + done = set() + for line in TASK_LOG.read_text().splitlines(): + parts = line.strip().split() + if len(parts) >= 3 and parts[2] == "PASS": + done.add(parts[1]) + return done + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage4 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def run_huc(huc8: str) -> None: + from fimbox import run_calibration, CalibrationConfig + from fimbox._dask import _resolve_n_workers + + aoi_root = OUT_DIR / f"HUC{huc8}" + if not aoi_root.exists(): + raise FileNotFoundError(f"AOI folder not found — run Stage 1 first: {aoi_root}") + + cfg = CalibrationConfig( + # reset — revert to uncalibrated baseline on re-runs + calibration_rerun=True, + + # aggregate_pre — assemble usgs/ras2fim elev tables before adjustments + aggregate_pre=True, + + # thalweg — remove thalweg-notch artifact rows, refill stage ladder + thalweg_notches_adjustment=True, + + # longitudinal — smooth hydraulic geometry along reach chains + longitudinal_filter=True, + + # bathymetry — add missing in-channel area below DEM (self-skips if no overlap) + bathymetry_adjust=True, + bathy_file_ehydro=DATA_DIR / "final_bathymetry_ehydro_ohrfc.gpkg", + + # bankfull — identify bankfull stage in every branch SRC + src_bankfull_toggle=True, + bankfull_flows_file=DATA_DIR / "nwm3_high_water_threshold_cms.parquet", + include_branch_zero=True, + + # subdiv — channel/overbank subdivision (needs vmann + bankfull on) + src_subdiv_toggle=True, + vmann_input_file=DATA_DIR / "mannings_global_optz.parquet", + default_channel_n=0.06, + default_overbank_n=0.12, + + # nonmonotonic — force monotonic in-channel rating curves + nonmonotonic_src_adjustment=True, + nonmonotonic_stream_order_min=4, + + # usgs — calibrate SRCs against USGS rating curves at NWM recurrence flows + src_adjust_usgs=True, + usgs_rating_curve_csv=DATA_DIR / "usgs_rating_curves.parquet", + usgs_acceptable_gages=DATA_DIR / "acceptable_sites_for_rating_curves.parquet", + nwm_recur_file=DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet", + + # spatial — self-skips when calib_points_file=None (no benchmark points yet) + src_adjust_spatial=True, + calib_points_file=None, + + # manual — self-skips when man_calb_file=None + manual_calb_toggle=True, + man_calb_file=None, + + # aggregate_post — publish htable + bridge + road to AOI root + aggregate_post=True, + + # log scan — collect error/warning lines into per-AOI summary files + scan_logs=True, + + # execution + job_branch_limit=_resolve_n_workers(), + skip_unimplemented=True, + ) + run_calibration(aoi_root, cfg) + + +def _fmt(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h}h {m:02d}m {s:02d}s" + if m: + return f"{m}m {s:02d}s" + return f"{s}s" + + +def main(): + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + done = _load_done() + remaining = [h for h in hucs if h not in done] + + log.info(f"Stage 4: {len(hucs)} total | {len(done)} already done | {len(remaining)} to run") + log.info(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + batch_start = time.time() + huc_times: list[tuple[str, float, str]] = [] + passed, failed = list(done), [] + + for i, huc8 in enumerate(remaining, 1): + huc_start = time.time() + log.info(f" [{i}/{len(remaining)}] HUC8 = {huc8} | " + f"batch elapsed: {_fmt(huc_start - batch_start)}") + try: + run_huc(huc8) + elapsed = time.time() - huc_start + _log_result(huc8, "PASS") + passed.append(huc8) + huc_times.append((huc8, elapsed, "PASS")) + completed = [t for _, t, s in huc_times if s == "PASS"] + avg_s = sum(completed) / len(completed) + remaining_n = len(remaining) - i + eta_s = avg_s * remaining_n + log.info(f" [{huc8}] PASS | this HUC: {_fmt(elapsed)} | " + f"avg: {_fmt(avg_s)} | remaining: {remaining_n} | " + f"ETA: {_fmt(eta_s)}" + + (f" (~{datetime.fromtimestamp(time.time() + eta_s).strftime('%H:%M')})" + if remaining_n > 0 else " (last HUC)")) + except Exception: + elapsed = time.time() - huc_start + err = traceback.format_exc().splitlines()[-1] + _log_result(huc8, "FAIL", err) + failed.append(huc8) + huc_times.append((huc8, elapsed, "FAIL")) + log.error(f" [{huc8}] FAIL after {_fmt(elapsed)} | {err}") + + total = time.time() - batch_start + log.info(f"\n{'─'*60}") + log.info(f"Stage 4 complete | total time: {_fmt(total)}") + log.info(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f"Passed: {len(passed)} | Failed: {len(failed)}") + if huc_times: + log.info(f"\nPer-HUC timing summary:") + for huc8, elapsed, status in huc_times: + log.info(f" {huc8} {status:4s} {_fmt(elapsed)}") + if failed: + log.warning(f"\nFailed HUCs: {failed}") + log.warning("Re-run this script to retry (FAIL lines are automatically retried).") + + +if __name__ == "__main__": + main() diff --git a/scripts/s07_stage4_single_huc.py b/scripts/s07_stage4_single_huc.py new file mode 100644 index 0000000..d21944a --- /dev/null +++ b/scripts/s07_stage4_single_huc.py @@ -0,0 +1,84 @@ +""" +Stage 4 — Calibration for a SINGLE HUC8. +Use this to test one HUC before running run_stage4_calibration.py on all 24. + +Prerequisite: Stage 2 must have completed for this HUC. + +Set HUC8 below and run: + .venv\\Scripts\\python.exe scripts/run_single_huc_stage4.py +""" +import logging +import traceback +from pathlib import Path + +# ── SET THIS ────────────────────────────────────────────────────── +HUC8 = "03020102" +# ───────────────────────────────────────────────────────────────── + +OUT_DIR = Path("D:/SI/out") +DATA_DIR = Path("data") +TASK_LOG = Path("D:/SI/out/stage4_status.txt") + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage4 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def main(): + from fimbox import run_calibration, CalibrationConfig + from fimbox._dask import _resolve_n_workers + + aoi_root = OUT_DIR / f"HUC{HUC8}" + if not aoi_root.exists(): + log.error(f"AOI folder not found — run Stage 1 first: {aoi_root}") + return + + log.info(f"Stage 4 — HUC8: {HUC8}") + try: + cfg = CalibrationConfig( + calibration_rerun=True, + aggregate_pre=True, + thalweg_notches_adjustment=True, + longitudinal_filter=True, + bathymetry_adjust=True, + bathy_file_ehydro=DATA_DIR / "final_bathymetry_ehydro_ohrfc.gpkg", + src_bankfull_toggle=True, + bankfull_flows_file=DATA_DIR / "nwm3_high_water_threshold_cms.parquet", + include_branch_zero=True, + src_subdiv_toggle=True, + vmann_input_file=DATA_DIR / "mannings_global_optz.parquet", + default_channel_n=0.06, + default_overbank_n=0.12, + nonmonotonic_src_adjustment=True, + nonmonotonic_stream_order_min=4, + src_adjust_usgs=True, + usgs_rating_curve_csv=DATA_DIR / "usgs_rating_curves.parquet", + usgs_acceptable_gages=DATA_DIR / "acceptable_sites_for_rating_curves.parquet", + nwm_recur_file=DATA_DIR / "nwm3_17C_recurrence_flows_cfs.parquet", + src_adjust_spatial=True, + calib_points_file=None, + manual_calb_toggle=True, + man_calb_file=None, + aggregate_post=True, + scan_logs=True, + job_branch_limit=_resolve_n_workers(), + skip_unimplemented=True, + ) + run_calibration(aoi_root, cfg) + _log_result(HUC8, "PASS") + log.info("PASS") + except Exception: + err = traceback.format_exc() + _log_result(HUC8, "FAIL", err.splitlines()[-1]) + log.error(f"FAIL:\n{err}") + + +if __name__ == "__main__": + main() diff --git a/scripts/s08_stage5_all_hucs.py b/scripts/s08_stage5_all_hucs.py new file mode 100644 index 0000000..aba93c3 --- /dev/null +++ b/scripts/s08_stage5_all_hucs.py @@ -0,0 +1,139 @@ +""" +Stage 5 — FIM Generation for all HUC8s in the study area. + +Generates inundation extent + depth rasters for every discharge CSV +found in out//discharge-inputs/. + +Prerequisite: run_stage3_streamflow.py must have completed for a HUC +(discharge CSVs must exist before this script runs). + +Progress is written to TASK_LOG after each HUC so a re-run skips completed ones. + +Run: + .venv\\Scripts\\python.exe scripts/run_stage5_fim.py +""" +import logging +import time +import traceback +from datetime import datetime +from pathlib import Path +import pandas as pd + +# ── CONFIG ──────────────────────────────────────────────────────── +EXCEL_PATH = Path(r"C:\Users\Ali\OneDrive - CUNY\Desktop\SI\fimbox_SI26\data\study_area.xlsx") +HUC_CODE_COL = "HUC_CODE" +OUT_DIR = Path("E:/SI/out") +TASK_LOG = Path("E:/SI/out/stage5_status.txt") +# ───────────────────────────────────────────────────────────────── + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _load_done() -> set[str]: + if not TASK_LOG.exists(): + return set() + done = set() + for line in TASK_LOG.read_text().splitlines(): + parts = line.strip().split() + if len(parts) >= 3 and parts[2] == "PASS": + done.add(parts[1]) + return done + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage5 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def run_huc(huc8: str) -> None: + from fimbox import generateFIM + from fimbox._dask import _resolve_n_workers + + aoi_root = OUT_DIR / f"HUC{huc8}" + discharge_dir = aoi_root / "discharge-inputs" + + if not aoi_root.exists(): + raise FileNotFoundError(f"AOI folder not found — run Stage 1 first: {aoi_root}") + if not discharge_dir.exists() or not list(discharge_dir.glob("*.csv")): + raise FileNotFoundError( + f"No discharge CSVs found — run Stage 3 (streamflow) first: {discharge_dir}" + ) + + results = generateFIM( + aoi_root, n_workers=_resolve_n_workers(), depth=True + ).from_discharge_inputs() + + log.info(f" [{huc8}] {len(results)} FIM raster(s) written to {aoi_root / 'fim-outputs'}") + + +def _fmt(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h}h {m:02d}m {s:02d}s" + if m: + return f"{m}m {s:02d}s" + return f"{s}s" + + +def main(): + df = pd.read_excel(EXCEL_PATH) + hucs = [str(int(c)).zfill(8) for c in df[HUC_CODE_COL]] + done = _load_done() + remaining = [h for h in hucs if h not in done] + + log.info(f"Stage 5: {len(hucs)} total | {len(done)} already done | {len(remaining)} to run") + log.info(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + batch_start = time.time() + huc_times: list[tuple[str, float, str]] = [] + passed, failed = list(done), [] + + for i, huc8 in enumerate(remaining, 1): + huc_start = time.time() + log.info(f" [{i}/{len(remaining)}] HUC8 = {huc8} | " + f"batch elapsed: {_fmt(huc_start - batch_start)}") + try: + run_huc(huc8) + elapsed = time.time() - huc_start + _log_result(huc8, "PASS") + passed.append(huc8) + huc_times.append((huc8, elapsed, "PASS")) + completed = [t for _, t, s in huc_times if s == "PASS"] + avg_s = sum(completed) / len(completed) + remaining_n = len(remaining) - i + eta_s = avg_s * remaining_n + log.info(f" [{huc8}] PASS | this HUC: {_fmt(elapsed)} | " + f"avg: {_fmt(avg_s)} | remaining: {remaining_n} | " + f"ETA: {_fmt(eta_s)}" + + (f" (~{datetime.fromtimestamp(time.time() + eta_s).strftime('%H:%M')})" + if remaining_n > 0 else " (last HUC)")) + except Exception: + elapsed = time.time() - huc_start + err = traceback.format_exc().splitlines()[-1] + _log_result(huc8, "FAIL", err) + failed.append(huc8) + huc_times.append((huc8, elapsed, "FAIL")) + log.error(f" [{huc8}] FAIL after {_fmt(elapsed)} | {err}") + + total = time.time() - batch_start + log.info(f"\n{'─'*60}") + log.info(f"Stage 5 complete | total time: {_fmt(total)}") + log.info(f"Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + log.info(f"Passed: {len(passed)} | Failed: {len(failed)}") + if huc_times: + log.info(f"\nPer-HUC timing summary:") + for huc8, elapsed, status in huc_times: + log.info(f" {huc8} {status:4s} {_fmt(elapsed)}") + if failed: + log.warning(f"\nFailed HUCs: {failed}") + log.warning("Re-run this script to retry (FAIL lines are automatically retried).") + + +if __name__ == "__main__": + main() diff --git a/scripts/s08_stage5_single_huc.py b/scripts/s08_stage5_single_huc.py new file mode 100644 index 0000000..d4498a7 --- /dev/null +++ b/scripts/s08_stage5_single_huc.py @@ -0,0 +1,62 @@ +""" +Stage 5 — FIM Generation for a SINGLE HUC8. +Use this to test one HUC before running run_stage5_fim.py on all 24. + +Prerequisite: Stages 2 and 3 must have completed for this HUC +(branches and discharge CSVs must exist). + +Set HUC8 below and run: + .venv\\Scripts\\python.exe scripts/run_single_huc_stage5.py +""" +import logging +import traceback +from pathlib import Path + +# ── SET THIS ────────────────────────────────────────────────────── +HUC8 = "03020102" +# ───────────────────────────────────────────────────────────────── + +OUT_DIR = Path("D:/SI/out") +TASK_LOG = Path("D:/SI/out/stage5_status.txt") + +logging.basicConfig(level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%H:%M:%S") +log = logging.getLogger(__name__) + + +def _log_result(huc8: str, status: str, note: str = "") -> None: + TASK_LOG.parent.mkdir(parents=True, exist_ok=True) + with TASK_LOG.open("a") as f: + f.write(f"stage5 {huc8} {status}{(' ' + note) if note else ''}\n") + + +def main(): + from fimbox import generateFIM + from fimbox._dask import _resolve_n_workers + + aoi_root = OUT_DIR / f"HUC{HUC8}" + discharge_dir = aoi_root / "discharge-inputs" + + if not aoi_root.exists(): + log.error(f"AOI folder not found — run Stage 1 first: {aoi_root}") + return + if not discharge_dir.exists() or not list(discharge_dir.glob("*.csv")): + log.error(f"No discharge CSVs found — run Stage 3 first: {discharge_dir}") + return + + log.info(f"Stage 5 — HUC8: {HUC8}") + try: + results = generateFIM( + aoi_root, n_workers=_resolve_n_workers(), depth=True + ).from_discharge_inputs() + _log_result(HUC8, "PASS") + log.info(f"PASS — {len(results)} FIM raster(s) → {aoi_root / 'fim-outputs'}") + except Exception: + err = traceback.format_exc() + _log_result(HUC8, "FAIL", err.splitlines()[-1]) + log.error(f"FAIL:\n{err}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_getallinputdata.py b/tests/test_getallinputdata.py index 4167bec..3c0136e 100644 --- a/tests/test_getallinputdata.py +++ b/tests/test_getallinputdata.py @@ -3,22 +3,29 @@ import fimbox -test_boundary = Path("./docs/test_boundary/test_smallB.shp") +# ── EDIT THIS for each HUC8 you want to process ────────────────── +CURRENT_HUC8 = "03020102" +DEM_TILES_DIR = Path("D:/SI/out/study_area/watershed-data/dem_tiles") +# ───────────────────────────────────────────────────────────────── + +test_boundary = Path("./docs/study_boundary/study_area.gpkg") test_huc8 = "08060202" # Yazoo River basin, MS -# Combined preprocessing pipeline tests -# Run full pipeline from a boundary shapefile +# Run full pipeline for a single HUC8 using the pre-downloaded DEM tile. +# Change CURRENT_HUC8 above and re-run for each HUC in your study area. def test_preprocess_all_from_boundary(): + dem_path = DEM_TILES_DIR / f"dem_{CURRENT_HUC8}.tif" pp = fimbox.getAllInputData( - boundary=test_boundary, + huc8=CURRENT_HUC8, out_dir="../out", - buffer_m=5000, # metres to buffer boundary for data downloads - headwater_buffer_cells=8, # pixels to shrink buffer for headwater clip - get_flowlines=True, # set False to use your own flowlines and corresponding catchments - get_catchments=True, # set False to skip NWM catchments--> use - resolution="medium", # "high" -> NHDPlus HR flowlines/catchments via pynhd; "medium" (default) -> NWM. Lakes always NWM. - identifier="nwmmr", # filename prefix for ALL source files; flows download->processing. Default "nwm". + buffer_m=5000, + headwater_buffer_cells=8, + get_flowlines=True, + get_catchments=True, + resolution="medium", + identifier="nwmmr", + dem=dem_path, # use pre-downloaded 10m tile — skips 3DEP download ) pp.run()