eScienceLab · faizollah · Jun 29, 2026 · Jun 29, 2026
diff --git a/.github/workflows/realapps.yml b/.github/workflows/realapps.yml
@@ -0,0 +1,85 @@
+name: real-app e2e
+
+# Tier B integration: actually fetch a real Flower Hub app, run the federation
+# end to end, and validate the produced RO-Crate. This is heavier and more
+# network-dependent than the per-push `tests` workflow (which runs against
+# recorded fixtures), so it runs on a schedule and on demand, plus on push so it
+# can be watched per change.
+on:
+  push:
+    branches: ["**"]
+  schedule:
+    - cron: "0 3 * * *"   # nightly
+  workflow_dispatch:
+
+jobs:
+  e2e:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - app: quickstart-sklearn                 # app dir name under runner.temp; also the artifact suffix
+            handle: "@flwrlabs/quickstart-sklearn"  # Flower Hub handle passed to `flwr new`
+            pkg: sklearnexample                     # package dir whose server_app.py gets replaced
+            ci_server_app: tests/e2e/sklearn_ci_server_app.py  # CI ServerApp copied over the stock one
+            strategy: FedAvg                        # expected strategy name, passed to validate_crate.py
+            framework: scikit-learn                 # expected framework slug, passed to validate_crate.py
+            python: "3.11"                          # version handed to actions/setup-python
+
+    name: e2e (${{ matrix.app }})
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python }}
+
+      - name: Install flwrcrate (with Flower)
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .
+
+      - name: Fetch the app from the Flower Hub
+        working-directory: ${{ runner.temp }}
+        run: flwr new ${{ matrix.handle }}
+
+      - name: Integrate flwrcrate and install the app
+        run: |
+          APP="${{ runner.temp }}/${{ matrix.app }}"
+          # drop in the CI-integrated ServerApp (fixed /tmp paths)
+          cp "${{ matrix.ci_server_app }}" "$APP/${{ matrix.pkg }}/server_app.py"
+          # the tracker reads pyproject from a fixed absolute path (Ray worker has no cwd)
+          cp "$APP/pyproject.toml" /tmp/flcrate_pyproject.toml
+          # add a metric-URI mapping so a metric carries a semantic propertyID
+          printf '\n[tool.flwrcrate.metric-uris]\naccuracy = "https://schema.org/Accuracy"\n' >> /tmp/flcrate_pyproject.toml
+          pip install -e "$APP"
+
+      - name: Run the federation
+        working-directory: ${{ runner.temp }}/${{ matrix.app }}
+        run: |
+          rm -rf /tmp/flcrate_out
+          # flwr run detaches the simulation when there's no TTY, so kick it off
+          # and poll for the crate in this same step (keeps the background sim alive).
+          flwr run . --run-config "num-server-rounds=2" || true
+          for i in $(seq 0 89); do   # 90 polls, 5s apart; i*5 is the time already waited
+            if [ -f /tmp/flcrate_out/ro-crate/ro-crate-metadata.json ]; then
+              echo "crate appeared after ~$((i*5))s"; exit 0
+            fi
+            sleep 5
+          done
+          echo "::error::no crate produced within 450s (federation failed or hung)"; exit 1
+
+      - name: Validate the generated crate
+        run: |
+          python tests/e2e/validate_crate.py \
+            /tmp/flcrate_out/ro-crate/ro-crate-metadata.json \
+            "${{ matrix.strategy }}" "${{ matrix.framework }}"
+
+      - name: Upload the crate as an artifact
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: crate-${{ matrix.app }}
+          path: /tmp/flcrate_out/ro-crate/
+          if-no-files-found: warn
diff --git a/README.md b/README.md
@@ -1,6 +1,7 @@
 # flwrCrate
 
-[![tests](https://github.com/faizollah/flwrCrate/actions/workflows/tests.yml/badge.svg)](https://github.com/faizollah/flwrCrate/actions/workflows/tests.yml)
+[![tests](https://github.com/eScienceLab/flwrCrate/actions/workflows/tests.yml/badge.svg)](https://github.com/eScienceLab/flwrCrate/actions/workflows/tests.yml)
+[![real-app e2e](https://github.com/eScienceLab/flwrCrate/actions/workflows/realapps.yml/badge.svg)](https://github.com/eScienceLab/flwrCrate/actions/workflows/realapps.yml)
 
 Capture a [Flower](https://flower.ai/) federated learning run and emit an
 [RO-Crate](https://www.researchobject.org/ro-crate/) describing it — a
@@ -264,12 +265,19 @@ pytest                                        # run the full unit + integration
 pytest --cov=flwrcrate --cov-report=term-missing   # with a coverage report
 ```
 
-The suite is split into fast **unit** tests (dependency parsing, metric
-handling, slug/person helpers, crate assembly) and **integration** tests that
-drive the whole `FLCrateTracker` lifecycle and assert a complete, correct
-`ro-crate-metadata.json` — without needing Flower, Ray, or any ML framework
-installed. CI runs it on Python 3.10–3.12 (see the badge above) and enforces a
-minimum coverage of 85% (currently ~92%).
+Testing has two tiers:
+
+- **`tests` workflow (every push)** — fast **unit** tests + **integration**
+  tests that drive the whole `FLCrateTracker` lifecycle, plus **real-data tests**
+  that feed the *actual* captured output of the Tested-with apps
+  (`tests/fixtures/<app>/`) through the crate builder. None of these need
+  Flower, Ray, or an ML framework installed, so they run in seconds on Python
+  3.10–3.12 and enforce ≥85% coverage (currently ~92%).
+- **`real-app e2e` workflow (every push + nightly + on demand)** — actually
+  fetches a real Flower Hub app with `flwr new`, runs the federation end to end,
+  and validates the produced crate (`.github/workflows/realapps.yml`). This is
+  the only tier that exercises live capture from a running Flower simulation;
+  it's heavier and network-dependent, hence kept apart from the fast `tests` suite.
 
 ## License
 

diff --git a/tests/e2e/sklearn_ci_server_app.py b/tests/e2e/sklearn_ci_server_app.py
@@ -0,0 +1,55 @@
+"""CI end-to-end variant of the quickstart-sklearn ServerApp.
+
+Used by the real-app e2e workflow (.github/workflows/realapps.yml): the workflow
+fetches the stock @flwrlabs/quickstart-sklearn app with `flwr new`, drops this
+file in over its server_app.py, then actually runs the federation and validates
+the produced crate.
+
+The only differences from a normal integration are CI-friendly fixed absolute
+paths under /tmp — required because Flower runs the ServerApp in a Ray worker
+that doesn't inherit the shell's cwd or environment, so relative paths and env
+vars don't reach it. The workflow copies the app's pyproject.toml to
+/tmp/flcrate_pyproject.toml before running.
+"""
+
+import joblib
+from flwr.app import ArrayRecord, Context
+from flwr.serverapp import Grid, ServerApp
+from flwr.serverapp.strategy import FedAvg
+from flwrcrate import FLCrateTracker
+
+from sklearnexample.task import (
+    create_log_reg_and_instantiate_parameters,
+    get_model_params,
+    set_model_params,
+)
+
+app = ServerApp()
+
+
+@app.main()
+def main(grid: Grid, context: Context) -> None:
+    """Run a FedAvg federation and record it as an RO-Crate via FLCrateTracker.
+
+    Crate output and the pyproject copy use fixed /tmp paths; the fitted model
+    is pickled to a relative path, i.e. the process's current directory.
+    """
+    run_cfg = context.run_config
+    rounds: int = run_cfg["num-server-rounds"]
+    model = create_log_reg_and_instantiate_parameters(run_cfg["penalty"])
+    initial = ArrayRecord(get_model_params(model))
+    fedavg = FedAvg(fraction_train=1.0, fraction_evaluate=1.0)
+    model_file = "logreg_model.pkl"
+
+    with FLCrateTracker(
+        context, fedavg,
+        output_dir="/tmp/flcrate_out",
+        pyproject_path="/tmp/flcrate_pyproject.toml",
+        app_name="Quickstart scikit-learn (CI e2e)",
+        author={"name": "flwrCrate CI", "orcid": "https://orcid.org/0000-0000-0000-0000"},
+        license="https://spdx.org/licenses/MIT.html",
+    ) as tracker:
+        result = fedavg.start(grid=grid, initial_arrays=initial, num_rounds=rounds)
+        set_model_params(model, result.arrays.to_numpy_ndarrays())
+        joblib.dump(model, model_file)
+        tracker.record_result(result, model_path=model_file)
diff --git a/tests/e2e/validate_crate.py b/tests/e2e/validate_crate.py
@@ -0,0 +1,43 @@
+"""Validate a real generated RO-Crate for the e2e workflow.
+
+Usage: python validate_crate.py <ro-crate-metadata.json> <expected-strategy> <expected-framework-slug>
+Exits non-zero (failing the CI job) if the crate is missing or incomplete.
+"""
+
+import json
+import sys
+
+
+def main():
+    """Validate the crate at argv[1]; exit non-zero if it is missing, unreadable or incomplete."""
+    if len(sys.argv) != 4:
+        sys.exit(f"usage: {sys.argv[0]} <metadata.json> <strategy> <framework-slug>")
+    path, expected_strategy, expected_fw = sys.argv[1], sys.argv[2], sys.argv[3]
+    try:
+        with open(path, encoding="utf-8") as fh:  # context manager closes the handle
+            graph = json.load(fh)["@graph"]
+    except FileNotFoundError:
+        sys.exit(f"FAIL: crate not found at {path} (did the run produce one?)")
+    except (ValueError, KeyError, TypeError) as exc:  # bad/partial JSON or no "@graph"
+        sys.exit(f"FAIL: {path} is not a readable RO-Crate ({exc!r})")
+    g = {e["@id"]: e for e in graph if isinstance(e, dict) and "@id" in e}
+    root, run = g.get("./", {}), g.get("#fl-run", {})  # absent entity => FAIL lines, not KeyError
+    checks = {
+        "root mentions the run (#1)": root.get("mentions") == [{"@id": "#fl-run"}],
+        "run completed": run.get("actionStatus", {}).get("@id", "").endswith("CompletedActionStatus"),
+        f"strategy is {expected_strategy} (#2)": g.get("#fl-strategy", {}).get("name") == expected_strategy,
+        f"framework #framework-{expected_fw} captured (#4)": f"#framework-{expected_fw}" in g,
+        "at least one metric (#3)": any(k.startswith("#metric") for k in g),
+        "endTime set (record_result ran)": bool(run.get("endTime")),
+        "license + author + agent (#5)": all([root.get("license"), root.get("author"), run.get("agent")]),
+    }
+    failed = [name for name, ok in checks.items() if not ok]
+    for name, ok in checks.items():
+        print(f"  [{'PASS' if ok else 'FAIL'}] {name}")
+    if failed:
+        sys.exit(f"FAIL: {len(failed)} check(s) failed: {failed}")
+    print(f"OK: valid end-to-end crate ({expected_strategy} / {expected_fw}).")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/fixtures/fed-engines/captured_metadata.json b/tests/fixtures/fed-engines/captured_metadata.json
@@ -0,0 +1,91 @@
+{
+  "app_name": "Federated Engine Anomaly Detection",
+  "run_timing": {
+    "start_time": "2026-06-09T10:25:52.343680+00:00",
+    "end_time": "2026-06-09T10:26:10.660400+00:00"
+  },
+  "environment_config": {
+    "num-server-rounds": 3,
+    "num-clients": 2,
+    "fraction-train": 1.0,
+    "fraction-evaluate": 1.0,
+    "local-epochs": 1,
+    "batch-size": 32,
+    "learning-rate": 0.03,
+    "proximal-mu": 0.01,
+    "num-classes": 2,
+    "dataset-name": "luviner/industrial-faults",
+    "partitioner": "dirichlet",
+    "partition-by": "label",
+    "dirichlet-alpha": 0.3,
+    "dirichlet-min-partition-size": 10,
+    "dirichlet-self-balancing": false,
+    "val-fraction": 0.2,
+    "train-samples-per-client": 512,
+    "val-samples-per-client": 128,
+    "seed": 42
+  },
+  "federation": {
+    "num-clients": 2,
+    "fraction-train": 1.0,
+    "fraction-evaluate": 1.0,
+    "partitioner": "dirichlet",
+    "partition-by": "label",
+    "dirichlet-min-partition-size": 10,
+    "val-fraction": 0.2,
+    "train-samples-per-client": 512,
+    "val-samples-per-client": 128
+  },
+  "flower": {
+    "version": "1.30.0"
+  },
+  "frameworks": [
+    {
+      "package": "datasets",
+      "name": "Hugging Face Datasets",
+      "homepage": "https://huggingface.co/docs/datasets",
+      "declared": ">=3.1.0",
+      "installed_version": "4.8.5",
+      "known_framework": true
+    },
+    {
+      "package": "torch",
+      "name": "PyTorch",
+      "homepage": "https://pytorch.org/",
+      "declared": "==2.8.0",
+      "installed_version": "2.8.0",
+      "known_framework": true
+    }
+  ],
+  "strategy": {
+    "class_name": "FedProx",
+    "module": "flwr.serverapp.strategy.fedprox",
+    "attributes": {
+      "arrayrecord_key": "arrays",
+      "configrecord_key": "config",
+      "fraction_evaluate": 1.0,
+      "fraction_train": 1.0,
+      "min_available_nodes": 2,
+      "min_evaluate_nodes": 2,
+      "min_train_nodes": 2,
+      "proximal_mu": 0.01,
+      "weighted_by_key": "num-examples"
+    }
+  },
+  "final_metrics": {
+    "final_round": 3,
+    "metrics": {
+      "train_loss": 0.6608005687594414,
+      "train_acc": 0.712890625,
+      "train_balanced_acc": 0.5468078489943253,
+      "train_normal_recall": 0.3562555456965395,
+      "train_anomaly_recall": 0.7373601522921112,
+      "val_loss": 0.5283429622650146,
+      "val_acc": 0.78125,
+      "val_balanced_acc": 0.5343414358394574,
+      "val_normal_recall": 0.25,
+      "val_anomaly_recall": 0.8186828716789147
+    }
+  },
+  "metrics_log_file": "metrics_log.json"
+}
diff --git a/tests/fixtures/fed-engines/metrics_log.json b/tests/fixtures/fed-engines/metrics_log.json
@@ -0,0 +1,84 @@
+{
+  "federation": {
+    "num-clients": 2,
+    "fraction-train": 1.0,
+    "fraction-evaluate": 1.0,
+    "partitioner": "dirichlet",
+    "partition-by": "label",
+    "dirichlet-min-partition-size": 10,
+    "val-fraction": 0.2,
+    "train-samples-per-client": 512,
+    "val-samples-per-client": 128
+  },
+  "run_config": {
+    "num-server-rounds": 3,
+    "num-clients": 2,
+    "fraction-train": 1.0,
+    "fraction-evaluate": 1.0,
+    "local-epochs": 1,
+    "batch-size": 32,
+    "learning-rate": 0.03,
+    "proximal-mu": 0.01,
+    "num-classes": 2,
+    "dataset-name": "luviner/industrial-faults",
+    "partitioner": "dirichlet",
+    "partition-by": "label",
+    "dirichlet-alpha": 0.3,
+    "dirichlet-min-partition-size": 10,
+    "dirichlet-self-balancing": false,
+    "val-fraction": 0.2,
+    "train-samples-per-client": 512,
+    "val-samples-per-client": 128,
+    "seed": 42
+  },
+  "per_round": {
+    "1": {
+      "train_clientapp": {
+        "train_loss": 0.7037347061559558,
+        "train_acc": 0.5908203125,
+        "train_balanced_acc": 0.5055093743299932,
+        "train_normal_recall": 0.41481810115350487,
+        "train_anomaly_recall": 0.5962006475064816
+      },
+      "evaluate_clientapp": {
+        "val_loss": 0.43482594564557076,
+        "val_acc": 0.9296875,
+        "val_balanced_acc": 0.5,
+        "val_normal_recall": 0.0,
+        "val_anomaly_recall": 1.0
+      }
+    },
+    "2": {
+      "train_clientapp": {
+        "train_loss": 0.6651889113709331,
+        "train_acc": 0.8134765625,
+        "train_balanced_acc": 0.5246637139704542,
+        "train_normal_recall": 0.19520851818988466,
+        "train_anomaly_recall": 0.8541189097510236
+      },
+      "evaluate_clientapp": {
+        "val_loss": 0.6743118688464165,
+        "val_acc": 0.4609375,
+        "val_balanced_acc": 0.6145656679856792,
+        "val_normal_recall": 0.7916666666666667,
+        "val_anomaly_recall": 0.43746466930469197
+      }
+    },
+    "3": {
+      "train_clientapp": {
+        "train_loss": 0.6608005687594414,
+        "train_acc": 0.712890625,
+        "train_balanced_acc": 0.5468078489943253,
+        "train_normal_recall": 0.3562555456965395,
+        "train_anomaly_recall": 0.7373601522921112
+      },
+      "evaluate_clientapp": {
+        "val_loss": 0.5283429622650146,
+        "val_acc": 0.78125,
+        "val_balanced_acc": 0.5343414358394574,
+        "val_normal_recall": 0.25,
+        "val_anomaly_recall": 0.8186828716789147
+      }
+    }
+  }
+}