Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions src/TiledArray/dist_eval/contraction_eval.h
Original file line number Diff line number Diff line change
Expand Up @@ -728,24 +728,46 @@ class Summa

ProcessID get_row_group_root(const ordinal_type k,
const madness::Group& row_group) const {
// The broadcast root is the process in this process row that owns the
// column-k source tiles, i.e. grid position (rank_row, k % proc_cols).
ProcessID group_root = k % proc_grid_.proc_cols();
if (!right_.shape().is_dense() &&
row_group.size() < static_cast<ProcessID>(proc_grid_.proc_cols())) {
const ProcessID world_root =
proc_grid_.rank_row() * proc_grid_.proc_cols() + group_root;
// The group has been pruned (it omits ranks that own no nonzero tiles
// for this column), so the root's group-local rank no longer equals its
// process-column index and must be looked up by its world rank. Use
// ProcGrid::map_col, which is exactly how make_group inserted the root
// into the group (it always includes the geometric root). Computing the
// world rank by hand here is a latent bug: it must account for the
// grid's rank offset (nonzero for the h-grouped 3-d batched grid), which
// map_col supplies. Without it Group::rank() fails to find the root and
// returns -1, aborting the broadcast.
const ProcessID world_root = proc_grid_.map_col(group_root);
group_root = row_group.rank(world_root);
TA_ASSERT(group_root >= 0);
}
return group_root;
}

ProcessID get_col_group_root(const ordinal_type k,
const madness::Group& col_group) const {
// The broadcast root is the process in this process column that owns the
// row-k source tiles, i.e. grid position (k % proc_rows, rank_col).
ProcessID group_root = k % proc_grid_.proc_rows();
if (!left_.shape().is_dense() &&
col_group.size() < static_cast<ProcessID>(proc_grid_.proc_rows())) {
const ProcessID world_root =
group_root * proc_grid_.proc_cols() + proc_grid_.rank_col();
// The group has been pruned (it omits ranks that own no nonzero tiles
// for this row), so the root's group-local rank no longer equals its
// process-row index and must be looked up by its world rank. Use
// ProcGrid::map_row, which is exactly how make_group inserted the root
// into the group (it always includes the geometric root). Computing the
// world rank by hand here is a latent bug: it must account for the
// grid's rank offset (nonzero for the h-grouped 3-d batched grid), which
// map_row supplies. Without it Group::rank() fails to find the root and
// returns -1, aborting the broadcast.
const ProcessID world_root = proc_grid_.map_row(group_root);
group_root = col_group.rank(world_root);
TA_ASSERT(group_root >= 0);
}
return group_root;
}
Expand Down
87 changes: 87 additions & 0 deletions tests/proc_grid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,93 @@ BOOST_AUTO_TEST_CASE(make_groups) {
}
}

// Regression test for the SUMMA broadcast-root computation
// (Summa::get_row_group_root / get_col_group_root in
// dist_eval/contraction_eval.h).
//
// When the right (resp. left) operand is block-sparse the SUMMA row (resp.
// column) broadcast group is pruned to the ranks that own nonzero source
// tiles, so its size drops below proc_cols (resp. proc_rows). In that case the
// broadcast root's group-local rank no longer equals its process-column
// (-row) index and must be looked up from its *world* rank. make_group always
// inserts the geometric root into the group via ProcGrid::map_col / map_row,
// whose returned world rank includes the grid's rank_offset_ -- the world rank
// of the grid's first process. rank_offset_ is nonzero for the h-grouped 3-d
// batched SUMMA grid (every slab group beyond group 0). The old root code
// reconstructed the world rank by hand as rank_row*proc_cols + group_root
// (resp. group_root*proc_cols + rank_col), *omitting rank_offset_*, so for an
// offset grid Group::rank() failed to find the root and returned -1, tripping
// MADNESS_ASSERT(group_root >= 0) in WorldGopInterface::bcast and aborting the
// process.
//
// The fix recomputes the root's world rank with ProcGrid::map_col / map_row,
// which is exactly the value make_group inserts. This test validates that
// primitive: on an offset grid map_col / map_row must include the offset, so
// the broadcast root the fix now uses is the value present in the group, while
// the old offset-less formula would have missed it (differing by exactly the
// nonzero offset). A full end-to-end SUMMA repro requires >= 4 ranks (proc_h_
// > 1 needs P >= 2*proc_rows*proc_cols and a pruned group needs proc_cols or
// proc_rows >= 2), beyond this suite's np<=2 runs.
BOOST_AUTO_TEST_CASE(summa_bcast_root_offset) {
// Baseline (np-independent): on an unoffset grid the offset-aware maps agree
// with the bare geometric formula the old root code used. The test ctor
// permits an arbitrary (test_rank, test_nprocs) regardless of world size and
// leaves rank_offset_ == 0.
{
TiledArray::detail::ProcGrid grid(*GlobalFixture::world, /*test_rank=*/5u,
/*test_nprocs=*/12u, /*rows=*/8,
/*cols=*/8,
/*row_size=*/64, /*col_size=*/64);
BOOST_REQUIRE_GT(grid.proc_cols(), 0u);
BOOST_REQUIRE_GT(grid.proc_rows(), 0u);
for (std::size_t c = 0; c < grid.proc_cols(); ++c)
BOOST_CHECK_EQUAL(
grid.map_col(c),
grid.rank_row() * ProcessID(grid.proc_cols()) + ProcessID(c));
for (std::size_t r = 0; r < grid.proc_rows(); ++r)
BOOST_CHECK_EQUAL(
grid.map_row(r),
grid.rank_col() + ProcessID(r) * ProcessID(grid.proc_cols()));
}

// Offset grid: this is the regression. Construct a process grid whose first
// process is at a nonzero world rank (as the h-grouped batched SUMMA does
// for slab groups beyond group 0) and verify the maps the fix now uses
// include the offset -- so the broadcast root is the in-group world rank,
// not the offset-less one the old code computed (which would be -1 once the
// group is pruned). Requires the world to be large enough to host an offset
// sub-grid (rank_subset_t ctor asserts rank_offset + nprocs <= world.size()).
const auto world_size = GlobalFixture::world->size();
if (world_size >= 2) {
const std::size_t nprocs = world_size - (world_size / 2); // upper ~half
const ProcessID offset = world_size - ProcessID(nprocs); // nonzero offset
BOOST_REQUIRE_GT(offset, 0);
TiledArray::detail::ProcGrid grid(*GlobalFixture::world,
TiledArray::detail::rank_subset, offset,
nprocs, /*rows=*/8,
/*cols=*/8, /*row_size=*/64,
/*col_size=*/64);
// Only ranks inside the sub-grid have a valid (rank_row, rank_col).
if (GlobalFixture::world->rank() >= offset) {
for (std::size_t c = 0; c < grid.proc_cols(); ++c) {
const ProcessID offset_less =
grid.rank_row() * ProcessID(grid.proc_cols()) + ProcessID(c);
// map_col == the value make_group inserts as the row-broadcast root
BOOST_CHECK_EQUAL(grid.map_col(c), offset_less + offset);
// ... which differs from the old offset-less root computation
BOOST_CHECK_NE(grid.map_col(c), offset_less);
}
for (std::size_t r = 0; r < grid.proc_rows(); ++r) {
const ProcessID offset_less =
grid.rank_col() + ProcessID(r) * ProcessID(grid.proc_cols());
// map_row == the value make_group inserts as the col-broadcast root
BOOST_CHECK_EQUAL(grid.map_row(r), offset_less + offset);
BOOST_CHECK_NE(grid.map_row(r), offset_less);
}
}
}
}

#if 0
// This test case us used to evaluate distribute statistics. This unit test
// should only be enabled when changes are made to the ProcGrid algorithm, and
Expand Down
Loading