Loading [MathJax]/extensions/tex2jax.js
CoreNEURON
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
cellorder2.cpp
Go to the documentation of this file.
1 /*
2 # =============================================================================
3 # Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================
7 */
8 
9 #include <cstdio>
10 #include <map>
11 #include <set>
12 #include <algorithm>
13 #include <cstring>
14 #include <numeric>
15 
20 
21 // experiment starting with identical cell ordering
22 // groupindex aleady defined that keeps identical cells together
23 // begin with leaf to root ordering
24 namespace coreneuron {
25 using VTN = VecTNode; // level of nodes
26 using VVTN = std::vector<VTN>; // group of levels
27 using VVVTN = std::vector<VVTN>; // groups
28 
29 // verify level in groups of nident identical nodes
30 void chklevel(VTN& level, size_t nident = 8) {}
31 
32 // first child before second child, etc
33 // if same parent level, then parent order
34 // if not same parent, then earlier parent (no parent earlier than parent)
35 // if same parents, then children order
36 // if no parents then nodevec_index order.
37 static bool sortlevel_cmp(TNode* a, TNode* b) {
38  // when starting with leaf to root order
39  // note that leaves are at max level and all roots at level 0
40  bool result = false;
41  // since cannot have an index < 0, just add 1 to level
42  size_t palevel = a->parent ? 1 + a->parent->level : 0;
43  size_t pblevel = b->parent ? 1 + b->parent->level : 0;
44  if (palevel < pblevel) { // only used when starting leaf to root order
45  result = true; // earlier level first
46  } else if (palevel == pblevel) { // always true when starting root to leaf
47  if (palevel == 0) { // a and b are roots
48  if (a->nodevec_index < b->nodevec_index) {
49  result = true;
50  }
51  } else { // parent order (already sorted with proper treenode_order)
52  if (a->treenode_order < b->treenode_order) { // children order
53  result = true;
54  } else if (a->treenode_order == b->treenode_order) {
56  result = true;
57  }
58  }
59  }
60  }
61  return result;
62 }
63 
64 static void sortlevel(VTN& level) {
65  std::sort(level.begin(), level.end(), sortlevel_cmp);
66 
67  for (size_t i = 0; i < level.size(); ++i) {
68  level[i]->treenode_order = i;
69  }
70 }
71 
72 // TODO: refactor since sortlevel() is traversing the nodes in same order
73 static void set_treenode_order(VVTN& levels) {
74  size_t order = 0;
75  for (auto& level: levels) {
76  for (auto* nd: level) {
77  nd->treenode_order = order++;
78  }
79  }
80 }
81 
82 #if CORENRN_DEBUG
83 // every level starts out with no race conditions involving both
84 // parent and child in the same level. Can we arrange things so that
85 // every level has at least 32 nodes?
86 static size_t g32(TNode* nd) {
87  return nd->nodevec_index / warpsize;
88 }
89 
90 static bool is_parent_race(TNode* nd) { // vitiating
91  size_t pg = g32(nd);
92  for (const auto& child: nd->children) {
93  if (pg == g32(child)) {
94  return true;
95  }
96  }
97  return false;
98 }
99 #endif
100 
101 // less than 32 apart
102 static bool is_parent_race2(TNode* nd) { // vitiating
103  size_t pi = nd->nodevec_index;
104  for (const auto& child: nd->children) {
105  if (child->nodevec_index - pi < warpsize) {
106  return true;
107  }
108  }
109  return false;
110 }
111 
112 #if CORENRN_DEBUG
113 static bool is_child_race(TNode* nd) { // potentially handleable by atomic
114  if (nd->children.size() < 2) {
115  return false;
116  }
117  if (nd->children.size() == 2) {
118  return g32(nd->children[0]) == g32(nd->children[1]);
119  }
120  std::set<size_t> s;
121  for (const auto& child: nd->children) {
122  std::size_t gc = g32(child);
123  if (s.find(gc) != s.end()) {
124  return true;
125  }
126  s.insert(gc);
127  }
128  return false;
129 }
130 #endif
131 
132 static bool is_child_race2(TNode* nd) { // potentially handleable by atomic
133  if (nd->children.size() < 2) {
134  return false;
135  }
136  if (nd->children.size() == 2) {
137  size_t c0 = nd->children[0]->nodevec_index;
138  size_t c1 = nd->children[1]->nodevec_index;
139  c0 = (c0 < c1) ? (c1 - c0) : (c0 - c1);
140  return c0 < warpsize;
141  }
142  size_t ic0 = nd->children[0]->nodevec_index;
143  for (size_t i = 1; i < nd->children.size(); ++i) {
144  size_t ic = nd->children[i]->nodevec_index;
145  if (ic - ic0 < warpsize) {
146  return true;
147  }
148  ic0 = ic;
149  }
150  return false;
151 }
152 
153 size_t dist2child(TNode* nd) {
154  size_t d = 1000;
155  size_t pi = nd->nodevec_index;
156  for (const auto& child: nd->children) {
157  std::size_t d1 = child->nodevec_index - pi;
158  if (d1 < d) {
159  d = d1;
160  }
161  }
162  return d;
163 }
164 
165 // from stackoverflow.com
166 template <typename T>
167 static void move_range(size_t start, size_t length, size_t dst, std::vector<T>& v) {
168  typename std::vector<T>::iterator first, middle, last;
169  if (start < dst) {
170  first = v.begin() + start;
171  middle = first + length;
172  last = v.begin() + dst;
173  } else {
174  first = v.begin() + dst;
175  middle = v.begin() + start;
176  last = middle + length;
177  }
178  std::rotate(first, middle, last);
179 }
180 
181 static void move_nodes(size_t start, size_t length, size_t dst, VTN& nodes) {
182  nrn_assert(dst <= nodes.size());
183  nrn_assert(start + length <= dst);
184  move_range(start, length, dst, nodes);
185 
186  // check correctness of move
187  for (size_t i = start; i < dst - length; ++i) {
188  nrn_assert(nodes[i]->nodevec_index == i + length);
189  }
190  for (size_t i = dst - length; i < dst; ++i) {
191  nrn_assert(nodes[i]->nodevec_index == start + (i - (dst - length)));
192  }
193 
194  // update nodevec_index
195  for (size_t i = start; i < dst; ++i) {
196  nodes[i]->nodevec_index = i;
197  }
198 }
199 
200 #if CORENRN_DEBUG
201 // least number of nodes to move after nd to eliminate prace
202 static size_t need2move(TNode* nd) {
203  size_t d = dist2child(nd);
204  return warpsize - ((nd->nodevec_index % warpsize) + d);
205 }
206 
207 static void how_many_warpsize_groups_have_only_leaves(VTN& nodes) {
208  size_t n = 0;
209  for (size_t i = 0; i < nodes.size(); i += warpsize) {
210  bool r = true;
211  for (size_t j = 0; j < warpsize; ++j) {
212  if (!nodes[i + j]->children.empty()) {
213  r = false;
214  break;
215  }
216  }
217  if (r) {
218  printf("warpsize group %ld starting at level %ld\n", i / warpsize, nodes[i]->level);
219  ++n;
220  }
221  }
222  printf("number of warpsize groups with only leaves = %ld\n", n);
223 }
224 
225 static void pr_race_situation(VTN& nodes) {
226  size_t prace2 = 0;
227  size_t prace = 0;
228  size_t crace = 0;
229  for (size_t i = nodes.size() - 1; nodes[i]->level != 0; --i) {
230  TNode* nd = nodes[i];
231  if (is_parent_race2(nd)) {
232  ++prace2;
233  }
234  if (is_parent_race(nd)) {
235  printf("level=%ld i=%ld d=%ld n=%ld",
236  nd->level,
237  nd->nodevec_index,
238  dist2child(nd),
239  need2move(nd));
240  for (const auto& cnd: nd->children) {
241  printf(" %ld %ld", cnd->level, cnd->nodevec_index);
242  }
243  printf("\n");
244  ++prace;
245  }
246  if (is_child_race(nd)) {
247  ++crace;
248  }
249  }
250  printf("prace=%ld crace=%ld prace2=%ld\n", prace, crace, prace2);
251 }
252 #endif
253 
254 static size_t next_leaf(TNode* nd, VTN& nodes) {
255  size_t i = 0;
256  for (i = nd->nodevec_index - 1; i > 0; --i) {
257  if (nodes[i]->children.empty()) {
258  return i;
259  }
260  }
261  // nrn_assert(i > 0);
262  return 0;
263 }
264 
265 static void checkrace(TNode* nd, VTN& nodes) {
266  for (size_t i = nd->nodevec_index; i < nodes.size(); ++i) {
267  if (is_parent_race2(nodes[i])) {
268  // printf("checkrace %ld\n", i);
269  }
270  }
271 }
272 
273 static bool eliminate_race(TNode* nd, size_t d, VTN& nodes, TNode* look) {
274  // printf("eliminate_race %ld %ld\n", nd->nodevec_index, d);
275  // opportunistically move that number of leaves
276  // error if no leaves left to move.
277  size_t i = look->nodevec_index;
278  while (d > 0) {
279  i = next_leaf(nodes[i], nodes);
280  if (i == 0) {
281  return false;
282  }
283  size_t n = 1;
284  while (nodes[i - 1]->children.empty() && n < d) {
285  --i;
286  ++n;
287  }
288  // printf(" move_nodes src=%ld len=%ld dest=%ld\n", i, n, nd->nodevec_index);
289  move_nodes(i, n, nd->nodevec_index + 1, nodes);
290  d -= n;
291  }
292  checkrace(nd, nodes);
293  return true;
294 }
295 
296 static void eliminate_prace(TNode* nd, VTN& nodes) {
297  size_t d = warpsize - dist2child(nd);
298  bool b = eliminate_race(nd, d, nodes, nd);
299  if (0 && !b) {
300  printf("could not eliminate prace for g=%ld c=%ld l=%ld o=%ld %ld\n",
301  nd->groupindex,
302  nd->cellindex,
303  nd->level,
304  nd->treenode_order,
305  nd->hash);
306  }
307 }
308 
309 static void eliminate_crace(TNode* nd, VTN& nodes) {
310  size_t c0 = nd->children[0]->nodevec_index;
311  size_t c1 = nd->children[1]->nodevec_index;
312  size_t d = warpsize - ((c0 > c1) ? (c0 - c1) : (c1 - c0));
313  TNode* cnd = nd->children[0];
314  bool b = eliminate_race(cnd, d, nodes, nd);
315  if (0 && !b) {
316  printf("could not eliminate crace for g=%ld c=%ld l=%ld o=%ld %ld\n",
317  nd->groupindex,
318  nd->cellindex,
319  nd->level,
320  nd->treenode_order,
321  nd->hash);
322  }
323 }
324 
325 static void question2(VVTN& levels) {
326  // number of compartments in the group
327  std::size_t nnode = std::accumulate(levels.begin(),
328  levels.end(),
329  0,
330  [](std::size_t s, const VTN& l) { return s + l.size(); });
331  VTN nodes(nnode); // store the sorted nodes from analyze function
332  nnode = 0;
333  for (const auto& level: levels) {
334  for (const auto& l: level) {
335  nodes[nnode++] = l;
336  }
337  }
338  for (size_t i = 0; i < nodes.size(); ++i) {
339  nodes[i]->nodevec_index = i;
340  }
341 
342  // how_many_warpsize_groups_have_only_leaves(nodes);
343 
344  // Here we need to make sure that the dependent nodes
345  // belong to separate warps
346 
347  // work backward and check the distance from parent to children.
348  // if parent in different group (warp?) then there is no vitiating race.
349  // if children in different group (warp?) then ther is no race (satisfied by
350  // atomic).
351  // If there is a vitiating race, then figure out how many nodes
352  // need to be inserted just before the parent to avoid the race.
353  // It is not clear if we should prioritize safe nodes (when moved they
354  // do not introduce a race) and/or contiguous nodes (probably, to keep
355  // the low hanging fruit together).
356  // At least, moved nodes should have proper tree order and not themselves
357  // introduce a race at their new location. Leaves are nice in that there
358  // are no restrictions in movement toward higher indices.
359  // Note that unless groups of 32 are inserted, it may be the case that
360  // races are generated at greater indices since otherwise a portion of
361  // each group is placed into the next group. This would not be an issue
362  // if, in fact, the stronger requirement of every parent having
363  // pi (parent index) + 32 <= ci (child index) is demanded instead of merely being in different
364  // warpsize. One nice thing about adding warpsize nodes is that it does not disturb any
365  // existing contiguous groups except the moved group which gets divided between parent
366  // warpsize and child, where the nodes past the parent get same relative indices in the next
367  // warpsize
368 
369  // let's see how well we can do by opportunistically moving leaves to
370  // separate parents from children by warpsize (ie is_parent_prace2 is false)
371  // Hopefully, we won't run out of leaves before eliminating all
372  // is_parent_prace2
373 
374  if (0 && nodes.size() % warpsize != 0) {
375  size_t nnode = nodes.size() - levels[0].size();
376  printf("warp of %ld cells has %ld nodes in last cycle %ld\n",
377  levels[0].size(),
378  nnode % warpsize,
379  nnode / warpsize + 1);
380  }
381 
382  // pr_race_situation(nodes);
383 
384  // eliminate parent and children races using leaves
385  // traverse all the children (no roots)
386  for (size_t i = nodes.size() - 1; i >= levels[0].size(); --i) {
387  TNode* nd = nodes[i];
388  if (is_child_race2(nd)) {
389  eliminate_crace(nd, nodes);
390  i = nd->nodevec_index;
391  }
392  if (is_parent_race2(nd)) {
393  eliminate_prace(nd, nodes);
394  i = nd->nodevec_index;
395  }
396  }
397  // copy nodes indices to treenode_order
398  for (size_t i = 0; i < nodes.size(); ++i) {
399  nodes[i]->treenode_order = i;
400  }
401 }
402 
403 // analyze each group of cells
404 // the cells are grouped based on warp balance (lpt) algorithm
405 static void analyze(VVTN& levels) {
406  // sort each level with respect to parent level order
407  // earliest parent level first.
408 
409  // treenode order can be anything as long as first children < second
410  // children etc.. After sorting a level, the order will be correct for
411  // that level, ranging from [0:level.size]
412  for (auto& level: levels) {
413  chklevel(level); // does nothing
414  for (const auto& nd: level) {
415  for (size_t k = 0; k < nd->children.size(); ++k) {
416  nd->children[k]->treenode_order = k;
417  }
418  }
419  }
420 
421  for (auto& level: levels) {
422  sortlevel(level);
423  chklevel(level); // does nothing
424  }
425 
426  set_treenode_order(levels);
427 }
428 
429 void prgroupsize(VVVTN& groups) {
430 #if CORENRN_DEBUG
431  for (size_t i = 0; i < groups[0].size(); ++i) {
432  printf("%5ld\n", i);
433  for (const auto& group: groups) {
434  printf(" %5ld", group[i].size());
435  }
436  printf("\n");
437  }
438 #endif
439 }
440 
441 // group index primary, treenode_order secondary
442 static bool final_nodevec_cmp(TNode* a, TNode* b) {
443  bool result = false;
444  if (a->groupindex < b->groupindex) {
445  result = true;
446  } else if (a->groupindex == b->groupindex) {
447  if (a->treenode_order < b->treenode_order) {
448  result = true;
449  }
450  }
451  return result;
452 }
453 
454 static void set_nodeindex(VecTNode& nodevec) {
455  for (size_t i = 0; i < nodevec.size(); ++i) {
456  nodevec[i]->nodevec_index = i;
457  }
458 }
459 
460 void group_order2(VecTNode& nodevec, size_t groupsize, size_t ncell) {
461  size_t maxlevel = level_from_root(nodevec);
462 
463  // reset TNode.groupindex
464  size_t nwarp = warp_balance(ncell, nodevec);
465 
466  // work on a cellgroup as a vector of levels. ie only possible race is
467  // two children in same warpsize
468 
469  // every warp deals with a group of cells
470  // the cell dispatching to the available groups is done through the warp_balance function (lpt
471  // algo)
472  VVVTN groups(nwarp ? nwarp : (ncell / groupsize + ((ncell % groupsize) ? 1 : 0)));
473 
474  for (auto& group: groups) {
475  group.resize(maxlevel + 1);
476  }
477 
478  // group the cells according to their groupindex and according to their level (see
479  // level_from_root)
480  for (const auto& nd: nodevec) {
481  groups[nd->groupindex][nd->level].push_back(nd);
482  }
483 
484  prgroupsize(groups); // debugging
485 
486  // deal with each group
487  for (auto& group: groups) {
488  analyze(group);
489  question2(group);
490  }
491 
492  // final nodevec order according to group_index and treenode_order
493  std::sort(nodevec.begin() + ncell, nodevec.end(), final_nodevec_cmp);
494  set_nodeindex(nodevec);
495 }
496 } // namespace coreneuron
coreneuron::dist2child
size_t dist2child(TNode *nd)
Definition: cellorder2.cpp:153
coreneuron::TNode::hash
size_t hash
Hash algorith that generates a hash based on the hash of the children and the number of compartments ...
Definition: tnode.hpp:31
coreneuron::level_from_root
size_t level_from_root(VecTNode &)
Definition: cellorder1.cpp:203
coreneuron::question2
static void question2(VVTN &levels)
Definition: cellorder2.cpp:325
coreneuron::VVTN
std::vector< VTN > VVTN
Definition: cellorder2.cpp:26
coreneuron::move_range
static void move_range(size_t start, size_t length, size_t dst, std::vector< T > &v)
Definition: cellorder2.cpp:167
warpsize
#define warpsize
Definition: tnode.hpp:85
coreneuron::sortlevel
static void sortlevel(VTN &level)
Definition: cellorder2.cpp:64
coreneuron::is_parent_race2
static bool is_parent_race2(TNode *nd)
Definition: cellorder2.cpp:102
coreneuron::chklevel
void chklevel(VTN &level, size_t nident=8)
Definition: cellorder2.cpp:30
coreneuron::next_leaf
static size_t next_leaf(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:254
coreneuron::checkrace
static void checkrace(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:265
coreneuron
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
Definition: corenrn_parameters.cpp:12
coreneuron::ncell
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;}}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) if(nt->compute_gpu) async(nt->stream_id)) nrn_pragma_omp(target teams loop if(nt->compute_gpu)) for(int icore=0;icore< ncore;icore+=warpsize) { int iwarp=icore/warpsize;int ic=icore &(warpsize - 1);int ncycle=ncycles[iwarp];int *stride=strides+stridedispl[iwarp];int root=rootbegin[iwarp];int lastroot=rootbegin[iwarp+1];int firstnode=nodebegin[iwarp];int lastnode=nodebegin[iwarp+1];triang_interleaved2(nt, ic, ncycle, stride, lastnode);bksub_interleaved2(nt, root+ic, lastroot, ic, ncycle, stride, firstnode);} nrn_pragma_acc(wait(nt->stream_id))}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
Definition: cellorder.cpp:636
coreneuron::TNode
TNode is the tree node that represents the tree of the compartments.
Definition: tnode.hpp:23
coreneuron::analyze
static void analyze(VVTN &levels)
Definition: cellorder2.cpp:405
coreneuron::i
int i
Definition: cellorder.cpp:485
coreneuron::VVVTN
std::vector< VVTN > VVVTN
Definition: cellorder2.cpp:27
nrniv_decl.h
coreneuron::VTN
VecTNode VTN
Definition: cellorder2.cpp:25
coreneuron::set_nodeindex
static void set_nodeindex(VecTNode &nodevec)
Definition: cellorder2.cpp:454
coreneuron::prgroupsize
void prgroupsize(VVVTN &groups)
Definition: cellorder2.cpp:429
tnode.hpp
coreneuron::warp_balance
size_t warp_balance(size_t ncell, VecTNode &nodevec)
Use of the LPT (Least Processing Time) algorithm to create balanced groups of cells.
Definition: balance.cpp:43
coreneuron::TNode::groupindex
size_t groupindex
Cell ID that this compartment belongs to.
Definition: tnode.hpp:54
coreneuron::eliminate_race
static bool eliminate_race(TNode *nd, size_t d, VTN &nodes, TNode *look)
Definition: cellorder2.cpp:273
coreneuron::is_child_race2
static bool is_child_race2(TNode *nd)
Definition: cellorder2.cpp:132
coreneuron::final_nodevec_cmp
static bool final_nodevec_cmp(TNode *a, TNode *b)
Definition: cellorder2.cpp:442
coreneuron::group_order2
void group_order2(VecTNode &, size_t groupsize, size_t ncell)
Implementation of the advanced interleaving strategy (interleave_permute_type == 2)
Definition: cellorder2.cpp:460
coreneuron::sortlevel_cmp
static bool sortlevel_cmp(TNode *a, TNode *b)
Definition: cellorder2.cpp:37
coreneuron::eliminate_prace
static void eliminate_prace(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:296
coreneuron::TNode::cellindex
size_t cellindex
level of of this compartment in the tree
Definition: tnode.hpp:53
coreneuron::TNode::treenode_order
size_t treenode_order
index in nodevec that is set in check() In cell permute 2 this is set as Breadth First traversal
Definition: tnode.hpp:35
coreneuron::groupsize
static size_t groupsize
Definition: cellorder1.cpp:25
coreneuron::TNode::children
VecTNode children
Definition: tnode.hpp:28
coreneuron::TNode::level
size_t level
For cell permute 1 (Interleaved):
Definition: tnode.hpp:52
cellorder.hpp
coreneuron::move_nodes
static void move_nodes(size_t start, size_t length, size_t dst, VTN &nodes)
Definition: cellorder2.cpp:181
coreneuron::TNode::parent
TNode * parent
Definition: tnode.hpp:27
coreneuron::VecTNode
std::vector< TNode * > VecTNode
Definition: tnode.hpp:17
v
#define v
Definition: md1redef.h:11
coreneuron::eliminate_crace
static void eliminate_crace(TNode *nd, VTN &nodes)
Definition: cellorder2.cpp:309
coreneuron::TNode::nodevec_index
size_t nodevec_index
Total number of compartments from the current node and below.
Definition: tnode.hpp:33
coreneuron::pi
double pi
Definition: register_mech.cpp:22
coreneuron::NrnThread::end
int end
Definition: multicore.hpp:98
nrn_assert
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
Definition: nrn_assert.h:33
coreneuron::set_treenode_order
static void set_treenode_order(VVTN &levels)
Definition: cellorder2.cpp:73
nrn_assert.h