CoreNEURON
test_solver.cpp
/*
# =============================================================================
# Copyright (c) 2022 Blue Brain Project/EPFL
#
# See top-level LICENSE file for details.
# =============================================================================.
*/
#include "coreneuron/apps/corenrn_parameters.hpp"
#include "coreneuron/gpu/nrn_acc_manager.hpp"
#include "coreneuron/permute/cellorder.hpp"
#include "coreneuron/permute/node_permute.h"
#include "coreneuron/sim/multicore.hpp"

#define BOOST_TEST_MODULE CoreNEURON solver
#include <boost/test/included/unit_test.hpp>

#include <algorithm>
#include <functional>
#include <iostream>
#include <limits>
#include <map>
#include <random>
#include <stdexcept>
#include <utility>
#include <vector>

using namespace coreneuron;
namespace utf = boost::unit_test;

struct SolverData {
    std::vector<double> d, rhs;
    std::vector<int> parent_index;
};

constexpr auto magic_index_value = -2;
constexpr auto magic_double_value = std::numeric_limits<double>::lowest();

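// Each implementation pairs one of CoreNEURON's --cell-permute settings (0, 1 or 2,
// applied below via interleave_permute_type) with where the solve runs: CPU, GPU, or
// the CUDA variant of permutation 2.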
enum struct SolverImplementation {
    CellPermute0_CPU,
    CellPermute0_GPU,
    CellPermute1_CPU,
    CellPermute1_GPU,
    CellPermute2_CPU,
    CellPermute2_GPU,
    CellPermute2_CUDA
};

std::ostream& operator<<(std::ostream& os, SolverImplementation impl) {
    if (impl == SolverImplementation::CellPermute0_CPU) {
        return os << "SolverImplementation::CellPermute0_CPU";
    } else if (impl == SolverImplementation::CellPermute0_GPU) {
        return os << "SolverImplementation::CellPermute0_GPU";
    } else if (impl == SolverImplementation::CellPermute1_CPU) {
        return os << "SolverImplementation::CellPermute1_CPU";
    } else if (impl == SolverImplementation::CellPermute1_GPU) {
        return os << "SolverImplementation::CellPermute1_GPU";
    } else if (impl == SolverImplementation::CellPermute2_CPU) {
        return os << "SolverImplementation::CellPermute2_CPU";
    } else if (impl == SolverImplementation::CellPermute2_GPU) {
        return os << "SolverImplementation::CellPermute2_GPU";
    } else if (impl == SolverImplementation::CellPermute2_CUDA) {
        return os << "SolverImplementation::CellPermute2_CUDA";
    } else {
        throw std::runtime_error("Invalid SolverImplementation");
    }
}

struct ToyModelConfig {
    int num_threads{1};
    int num_cells{1};
    int num_segments_per_cell{3};
    std::function<double(int, int)> produce_a{[](auto, auto) { return 3.14159; }},
        produce_b{[](auto, auto) { return 42.0; }}, produce_d{[](auto, auto) { return 7.0; }},
        produce_rhs{[](auto, auto) { return -16.0; }};
};

// TODO: include some global lock as a sanity check (only one instance of
// SetupThreads should exist at any given time)
struct SetupThreads {
    SetupThreads(SolverImplementation impl, ToyModelConfig config = {}) {
        corenrn_param.gpu = false;
        switch (impl) {
            case SolverImplementation::CellPermute0_GPU:
                corenrn_param.gpu = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute0_CPU:
                interleave_permute_type = 0;
                break;
            case SolverImplementation::CellPermute1_GPU:
                corenrn_param.gpu = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute1_CPU:
                interleave_permute_type = 1;
                break;
            case SolverImplementation::CellPermute2_CUDA:
                corenrn_param.cuda_interface = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute2_GPU:
                corenrn_param.gpu = true;
                [[fallthrough]];
            case SolverImplementation::CellPermute2_CPU:
                interleave_permute_type = 2;
                break;
        }
        use_solve_interleave = interleave_permute_type != 0;
        nrn_threads_create(config.num_threads);
        create_interleave_info();
        int num_cells_remaining{config.num_cells}, total_cells{};
        for (auto ithread = 0; ithread < nrn_nthread; ++ithread) {
            auto& nt = nrn_threads[ithread];
            // How many cells to distribute on this thread, trying to get the right
            // total even if num_threads does not exactly divide num_cells.
            nt.ncell = num_cells_remaining / (nrn_nthread - ithread);
            total_cells += nt.ncell;
            num_cells_remaining -= nt.ncell;
            // How many segments are there in this thread?
            nt.end = nt.ncell * config.num_segments_per_cell;
            auto const padded_size = nrn_soa_padded_size(nt.end, 0);
            // Allocate one big block because the GPU data transfer code assumes this.
            nt._ndata = padded_size * 4;
            nt._data = static_cast<double*>(emalloc_align(nt._ndata * sizeof(double)));
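            // The four quarters of this block are aliased below as the right-hand side,
            // the main diagonal and the two off-diagonal (parent coupling) coefficient
            // arrays of the tree-structured linear system that nrn_solve_minimal solves.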
            auto* vec_rhs = (nt._actual_rhs = nt._data + 0 * padded_size);
            auto* vec_d = (nt._actual_d = nt._data + 1 * padded_size);
            auto* vec_a = (nt._actual_a = nt._data + 2 * padded_size);
            auto* vec_b = (nt._actual_b = nt._data + 3 * padded_size);
            auto* parent_indices =
                (nt._v_parent_index = static_cast<int*>(emalloc_align(padded_size * sizeof(int))));
            // Magic value to check against later.
            std::fill(parent_indices, parent_indices + nt.end, magic_index_value);
            // Put all the root nodes first, then put the other segments
            // in blocks, i.e. ABCDAAAABBBBCCCCDDDD
            auto const get_index = [ncell = nt.ncell,
                                    nseg = config.num_segments_per_cell](auto icell, auto iseg) {
                if (iseg == 0) {
                    return icell;
                } else {
                    return ncell + icell * (nseg - 1) + iseg - 1;
                }
            };
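            // For example, with 4 cells and 5 segments per cell (the ABCD picture above)
            // get_index maps the roots (icell, 0) to 0..3, cell 0's remaining segments to
            // 4..7, cell 1's to 8..11, cell 2's to 12..15 and cell 3's to 16..19.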
            for (auto icell = 0; icell < nt.ncell; ++icell) {
                for (auto iseg = 0; iseg < config.num_segments_per_cell; ++iseg) {
                    auto const global_index = get_index(icell, iseg);
                    vec_a[global_index] = config.produce_a(icell, iseg);
                    vec_b[global_index] = config.produce_b(icell, iseg);
                    vec_d[global_index] = config.produce_d(icell, iseg);
                    vec_rhs[global_index] = config.produce_rhs(icell, iseg);
                    // 0th element is the root node, which has no parent;
                    // other elements are attached in a binary tree configuration
                    // |       0       |
                    // |      / \      |
                    // |     1   2     |
                    // |    / \ / \    |
                    // |   3  4 5  6   |
                    // TODO: include some other topologies, e.g. a long straight line, or
                    // an unbalanced tree.
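                    // With 7 segments per cell this gives: segments 1 and 2 attach to the
                    // root (segment 0), 3 and 4 to segment 1, and 5 and 6 to segment 2;
                    // the root itself gets parent index -1.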
                    auto const parent_id = iseg ? get_index(icell, (iseg - 1) / 2) : -1;
                    parent_indices[global_index] = parent_id;
                }
            }
            // Check we didn't mess up populating any parent indices
            for (auto i = 0; i < nt.end; ++i) {
                BOOST_REQUIRE(parent_indices[i] != magic_index_value);
                // Root nodes should come first for --cell-permute=0
                if (i < nt.ncell) {
                    BOOST_REQUIRE(parent_indices[i] == -1);
                }
            }
            if (interleave_permute_type) {
                nt._permute = interleave_order(nt.id, nt.ncell, nt.end, parent_indices);
                BOOST_REQUIRE(nt._permute);
                permute_data(vec_a, nt.end, nt._permute);
                permute_data(vec_b, nt.end, nt._permute);
                // This isn't done in CoreNEURON because these are reset every
                // time step, but permute d/rhs here so that the initial values
                // set by produce_d and produce_rhs are propagated consistently
                // to all of the solver implementations.
                permute_data(vec_d, nt.end, nt._permute);
                permute_data(vec_rhs, nt.end, nt._permute);
                // index values change as well as ordering
                permute_ptr(parent_indices, nt.end, nt._permute);
                node_permute(parent_indices, nt.end, nt._permute);
            }
        }
        if (impl == SolverImplementation::CellPermute0_GPU) {
            std::cout << "CellPermute0_GPU is a nonstandard configuration, copying data to the "
                         "device may produce warnings:";
        }
        if (corenrn_param.gpu) {
            setup_nrnthreads_on_device(nrn_threads, nrn_nthread);
        }
        if (impl == SolverImplementation::CellPermute0_GPU) {
            std::cout << "\n...no more warnings expected" << std::endl;
        }
        // Make sure we produced the number of cells we were aiming for
        BOOST_REQUIRE(total_cells == config.num_cells);
        BOOST_REQUIRE(num_cells_remaining == 0);
    }

    ~SetupThreads() {
        if (corenrn_param.gpu) {
            delete_nrnthreads_on_device(nrn_threads, nrn_nthread);
        }
        for (auto& nt: *this) {
            free_memory(std::exchange(nt._data, nullptr));
            delete[] std::exchange(nt._permute, nullptr);
            free_memory(std::exchange(nt._v_parent_index, nullptr));
        }
        destroy_interleave_info();
        nrn_threads_free();
    }

    auto dump_solver_data() {
        std::vector<SolverData> ret{static_cast<std::size_t>(nrn_nthread)};
        // Sync the solver data from GPU to host
        update_nrnthreads_on_host(nrn_threads, nrn_nthread);
        // Un-permute the data and store it in ret.{d,parent_index,rhs}
        for (auto i = 0; i < nrn_nthread; ++i) {
            auto& nt = nrn_threads[i];
            auto& sd = ret[i];
            sd.d.resize(nt.end, magic_double_value);
            sd.parent_index.resize(nt.end, magic_index_value);
            sd.rhs.resize(nt.end, magic_double_value);
            auto* inv_permute = nt._permute ? inverse_permute(nt._permute, nt.end) : nullptr;
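            // nt._permute maps an unpermuted node index to its slot in the permuted
            // arrays (so nt._actual_d[nt._permute[i]] belongs to node i); inv_permute is
            // the reverse map, used to translate parent indices, which are stored in
            // permuted terms, back to the unpermuted numbering used here.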
            for (auto i = 0; i < nt.end; ++i) {
                // index in permuted vectors
                auto const p_i = nt._permute ? nt._permute[i] : i;
                // parent index in permuted vectors
                auto const p_parent = nt._v_parent_index[p_i];
                // parent index in unpermuted vectors (i.e. on the same scale as `i`)
                auto const parent = p_parent == -1
                                        ? -1
                                        : (inv_permute ? inv_permute[p_parent] : p_parent);
                // Save the values to the de-permuted return structure
                sd.d[i] = nt._actual_d[p_i];
                sd.parent_index[i] = parent;
                sd.rhs[i] = nt._actual_rhs[p_i];
            }
            delete[] inv_permute;
            for (auto i = 0; i < nt.end; ++i) {
                BOOST_REQUIRE(sd.d[i] != magic_double_value);
                BOOST_REQUIRE(sd.parent_index[i] != magic_index_value);
                BOOST_REQUIRE(sd.rhs[i] != magic_double_value);
            }
        }
        return ret;
    }

    void solve() {
        for (auto& thread: *this) {
            nrn_solve_minimal(&thread);
        }
    }

    NrnThread* begin() const {
        return nrn_threads;
    }
    NrnThread* end() const {
        return nrn_threads + nrn_nthread;
    }
};

template <typename... Args>
auto solve_and_dump(Args&&... args) {
    SetupThreads threads{std::forward<Args>(args)...};
    threads.solve();
    return threads.dump_solver_data();
}

auto active_implementations() {
    // These are always available
    std::vector<SolverImplementation> ret{SolverImplementation::CellPermute0_CPU,
                                          SolverImplementation::CellPermute1_CPU,
                                          SolverImplementation::CellPermute2_CPU};
#ifdef CORENEURON_ENABLE_GPU
    // Consider making these steerable via a runtime switch in GPU builds
    ret.push_back(SolverImplementation::CellPermute0_GPU);
    ret.push_back(SolverImplementation::CellPermute1_GPU);
    ret.push_back(SolverImplementation::CellPermute2_GPU);
    ret.push_back(SolverImplementation::CellPermute2_CUDA);
#endif
    return ret;
}

void compare_solver_data(
    std::map<SolverImplementation, std::vector<SolverData>> const& solver_data) {
    // CellPermute0_CPU is the simplest version of the solver; it should always
    // be present and it's a good reference to use.
    constexpr auto ref_impl = SolverImplementation::CellPermute0_CPU;
    BOOST_REQUIRE(solver_data.find(ref_impl) != solver_data.end());
    auto const& ref_data = solver_data.at(ref_impl);
    for (auto const& [impl, impl_data]: solver_data) {
        // Must have compatible numbers of threads.
        BOOST_REQUIRE(impl_data.size() == ref_data.size());
        std::cout << "Comparing " << impl << " to " << ref_impl << std::endl;
        for (auto n_thread = 0ul; n_thread < impl_data.size(); ++n_thread) {
            // Must have compatible numbers of segments/data entries
            BOOST_REQUIRE(impl_data[n_thread].d.size() == ref_data[n_thread].d.size());
            BOOST_REQUIRE(impl_data[n_thread].parent_index.size() ==
                          ref_data[n_thread].parent_index.size());
            BOOST_REQUIRE(impl_data[n_thread].rhs.size() == ref_data[n_thread].rhs.size());
            BOOST_TEST(impl_data[n_thread].d == ref_data[n_thread].d,
                       boost::test_tools::per_element());
            BOOST_TEST(impl_data[n_thread].parent_index == ref_data[n_thread].parent_index,
                       boost::test_tools::per_element());
            BOOST_TEST(impl_data[n_thread].rhs == ref_data[n_thread].rhs,
                       boost::test_tools::per_element());
        }
    }
}

template <typename... Args>
auto compare_all_active_implementations(Args&&... args) {
    std::map<SolverImplementation, std::vector<SolverData>> solver_data;
    for (auto impl: active_implementations()) {
        solver_data[impl] = solve_and_dump(impl, std::forward<Args>(args)...);
    }
    compare_solver_data(solver_data);
    return solver_data;
}

// *Roughly* tuned to accommodate NVHPC 22.3 at -O0; the largest differences come
// from the pseudorandom seeded tests.
constexpr double default_tolerance = 2e-11;
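// Note that utf::tolerance applies to the per-element BOOST_TEST comparisons made in
// compare_solver_data while each test below runs, so the solver outputs only need to
// agree to within default_tolerance rather than bit-for-bit.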

// May need to add some different tolerances here
BOOST_AUTO_TEST_CASE(SingleCellAndThread, *utf::tolerance(default_tolerance)) {
    constexpr std::size_t segments = 32;
    ToyModelConfig config{};
    config.num_segments_per_cell = segments;
    auto const solver_data = compare_all_active_implementations(config);
    for (auto const& [impl, data]: solver_data) {
        BOOST_REQUIRE(data.size() == 1);  // nthreads
        BOOST_REQUIRE(data[0].d.size() == segments);
        BOOST_REQUIRE(data[0].parent_index.size() == segments);
        BOOST_REQUIRE(data[0].rhs.size() == segments);
    }
}

BOOST_AUTO_TEST_CASE(UnbalancedCellSingleThread, *utf::tolerance(default_tolerance)) {
    ToyModelConfig config{};
    config.num_segments_per_cell = 19;  // not a nice round number
    compare_all_active_implementations(config);
}

BOOST_AUTO_TEST_CASE(LargeCellSingleThread, *utf::tolerance(default_tolerance)) {
    ToyModelConfig config{};
    config.num_segments_per_cell = 4096;
    compare_all_active_implementations(config);
}

BOOST_AUTO_TEST_CASE(ManySmallCellsSingleThread, *utf::tolerance(default_tolerance)) {
    ToyModelConfig config{};
    config.num_cells = 1024;
    compare_all_active_implementations(config);
}

BOOST_AUTO_TEST_CASE(ManySmallCellsMultiThread, *utf::tolerance(default_tolerance)) {
    ToyModelConfig config{};
    config.num_cells = 1024;
    config.num_threads = 2;
    compare_all_active_implementations(config);
}

auto random_config() {
    std::mt19937_64 gen{42};
    ToyModelConfig config{};
    config.produce_a = [g = gen, d = std::normal_distribution{1.0, 0.1}](int icell,
                                                                         int iseg) mutable {
        return d(g);
    };
    config.produce_b = [g = gen, d = std::normal_distribution{7.0, 0.2}](int, int) mutable {
        return d(g);
    };
    config.produce_d = [g = gen, d = std::normal_distribution{-0.1, 0.01}](int, int) mutable {
        return d(g);
    };
    config.produce_rhs = [g = gen, d = std::normal_distribution{-15.0, 2.0}](int, int) mutable {
        return d(g);
    };
    return config;
}
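// Each producer above captures its own copy of the same seeded engine, so every
// SetupThreads instance built by compare_all_active_implementations sees identical
// pseudorandom inputs and any differences measured come from the solver
// implementations alone.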

BOOST_AUTO_TEST_CASE(LargeCellSingleThreadRandom, *utf::tolerance(default_tolerance)) {
    auto config = random_config();
    config.num_segments_per_cell = 4096;
    compare_all_active_implementations(config);
}

BOOST_AUTO_TEST_CASE(ManySmallCellsSingleThreadRandom, *utf::tolerance(default_tolerance)) {
    auto config = random_config();
    config.num_cells = 1024;
    compare_all_active_implementations(config);
}