CoreNEURON
cellorder.cpp
Go to the documentation of this file.
1 /*
2 # =============================================================================
3 # Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================
7 */
8 
9 #include "coreneuron/nrnconf.h"
14 #include "coreneuron/utils/lpt.hpp"
18 
19 #include "coreneuron/permute/node_permute.h" // for print_quality
20 
21 #ifdef _OPENACC
22 #include <openacc.h>
23 #endif
24 
25 #include <set>
26 
27 namespace coreneuron {
29 InterleaveInfo* interleave_info; // nrn_nthread array
30 
31 
33  std::swap(nwarp, info.nwarp);
34  std::swap(nstride, info.nstride);
35 
36  std::swap(stridedispl, info.stridedispl);
37  std::swap(stride, info.stride);
38  std::swap(firstnode, info.firstnode);
39  std::swap(lastnode, info.lastnode);
40  std::swap(cellsize, info.cellsize);
41 
42  std::swap(nnode, info.nnode);
43  std::swap(ncycle, info.ncycle);
44  std::swap(idle, info.idle);
45  std::swap(cache_access, info.cache_access);
46  std::swap(child_race, info.child_race);
47 }
48 
50  nwarp = info.nwarp;
51  nstride = info.nstride;
52 
58 
59  copy_array(nnode, info.nnode, nwarp);
60  copy_array(ncycle, info.ncycle, nwarp);
61  copy_array(idle, info.idle, nwarp);
64 }
65 
67  // self assignment
68  if (this == &info)
69  return *this;
70 
71  InterleaveInfo temp(info);
72 
73  this->swap(temp);
74  return *this;
75 }
76 
78  if (stride) {
83  }
84  if (stridedispl) {
86  }
87  if (idle) {
88  delete[] nnode;
89  delete[] ncycle;
90  delete[] idle;
91  delete[] cache_access;
92  delete[] child_race;
93  }
94 }
95 
99 }
100 
102  if (interleave_info) {
103  delete[] interleave_info;
104  interleave_info = nullptr;
105  }
106 }
107 
108 // more precise visualization of the warp quality
109 // can be called after admin2
110 static void print_quality2(int iwarp, InterleaveInfo& ii, int* p) {
111  int pc = (iwarp == 0); // print warp 0
112  pc = 0; // turn off printing
113  int nodebegin = ii.lastnode[iwarp];
114  int* stride = ii.stride + ii.stridedispl[iwarp];
115  int ncycle = ii.cellsize[iwarp];
116 
117  int inode = nodebegin;
118 
119  size_t nn = 0; // number of nodes in warp. '.'
120  size_t nx = 0; // number of idle cores on all cycles. 'X'
121  size_t ncacheline = 0;
122  ; // number of parent memory cacheline accesses.
123  // assmue warpsize is max number in a cachline so all o
124  size_t ncr = 0; // number of child race. nchild-1 of same parent in same cycle
125 
126  for (int icycle = 0; icycle < ncycle; ++icycle) {
127  int s = stride[icycle];
128  int lastp = -2;
129  if (pc)
130  printf(" ");
131  std::set<int> crace; // how many children have same parent in a cycle
132  for (int icore = 0; icore < warpsize; ++icore) {
133  char ch = '.';
134  if (icore < s) {
135  int par = p[inode];
136  if (crace.find(par) != crace.end()) {
137  ch = 'r';
138  ++ncr;
139  } else {
140  crace.insert(par);
141  }
142 
143  if (par != lastp + 1) {
144  ch = (ch == 'r') ? 'R' : 'o';
145  ++ncacheline;
146  }
147  lastp = p[inode++];
148  ++nn;
149  } else {
150  ch = 'X';
151  ++nx;
152  }
153  if (pc)
154  printf("%c", ch);
155  }
156  if (pc)
157  printf("\n");
158  }
159 
160  ii.nnode[iwarp] = nn;
161  ii.ncycle[iwarp] = size_t(ncycle);
162  ii.idle[iwarp] = nx;
163  ii.cache_access[iwarp] = ncacheline;
164  ii.child_race[iwarp] = ncr;
165  if (pc)
166  printf("warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access, %ld child races\n",
167  iwarp,
168  nn,
169  ncycle,
170  nx,
171  ncacheline,
172  ncr);
173 }
174 
175 static void print_quality1(int iwarp, InterleaveInfo& ii, int ncell, int* p) {
176  int pc = ((iwarp == 0) || iwarp == (ii.nwarp - 1)); // warp not to skip printing
177  pc = 0; // turn off printing.
178  int* stride = ii.stride;
179  int cellbegin = iwarp * warpsize;
180  int cellend = cellbegin + warpsize;
181  cellend = (cellend < stride[0]) ? cellend : stride[0];
182 
183  int ncycle = 0;
184  for (int i = cellbegin; i < cellend; ++i) {
185  if (ncycle < ii.cellsize[i]) {
186  ncycle = ii.cellsize[i];
187  }
188  }
189  nrn_assert(ncycle == ii.cellsize[cellend - 1]);
190  nrn_assert(ncycle <= ii.nstride);
191 
192  int ncell_in_warp = cellend - cellbegin;
193 
194  size_t n = 0; // number of nodes in warp (not including roots)
195  size_t nx = 0; // number of idle cores on all cycles. X
196  size_t ncacheline = 0;
197  ; // number of parent memory cacheline accesses.
198  // assume warpsize is max number in a cachline so
199  // first core has all o
200 
201  int inode = ii.firstnode[cellbegin];
202  for (int icycle = 0; icycle < ncycle; ++icycle) {
203  int sbegin = ncell - stride[icycle] - cellbegin;
204  int lastp = -2;
205  if (pc)
206  printf(" ");
207  for (int icore = 0; icore < warpsize; ++icore) {
208  char ch = '.';
209  if (icore < ncell_in_warp && icore >= sbegin) {
210  int par = p[inode + icore];
211  if (par != lastp + 1) {
212  ch = 'o';
213  ++ncacheline;
214  }
215  lastp = par;
216  ++n;
217  } else {
218  ch = 'X';
219  ++nx;
220  }
221  if (pc)
222  printf("%c", ch);
223  }
224  if (pc)
225  printf("\n");
226  inode += ii.stride[icycle + 1];
227  }
228 
229  ii.nnode[iwarp] = n;
230  ii.ncycle[iwarp] = (size_t) ncycle;
231  ii.idle[iwarp] = nx;
232  ii.cache_access[iwarp] = ncacheline;
233  ii.child_race[iwarp] = 0;
234  if (pc)
235  printf("warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access\n",
236  iwarp,
237  n,
238  ncycle,
239  nx,
240  ncacheline);
241 }
242 
243 static void warp_balance(int ith, InterleaveInfo& ii) {
244  size_t nwarp = size_t(ii.nwarp);
245  size_t smm[4][3]; // sum_min_max see cp below
246  for (size_t j = 0; j < 4; ++j) {
247  smm[j][0] = 0;
248  smm[j][1] = 1000000000;
249  smm[j][2] = 0;
250  }
251  double emax = 0.0, emin = 1.0;
252  for (size_t i = 0; i < nwarp; ++i) {
253  size_t n = ii.nnode[i];
254  double e = double(n) / (n + ii.idle[i]);
255  if (emax < e) {
256  emax = e;
257  }
258  if (emin > e) {
259  emin = e;
260  }
261  size_t s[4] = {n, ii.idle[i], ii.cache_access[i], ii.child_race[i]};
262  for (size_t j = 0; j < 4; ++j) {
263  smm[j][0] += s[j];
264  if (smm[j][1] > s[j]) {
265  smm[j][1] = s[j];
266  }
267  if (smm[j][2] < s[j]) {
268  smm[j][2] = s[j];
269  }
270  }
271  }
272  std::vector<size_t> v(nwarp);
273  for (size_t i = 0; i < nwarp; ++i) {
274  v[i] = ii.ncycle[i];
275  }
276  double bal = load_balance(v);
277 #ifdef DEBUG
278  printf(
279  "thread %d nwarp=%ld balance=%g warp_efficiency %g to %g\n", ith, nwarp, bal, emin, emax);
280  const char* cp[4] = {"nodes", "idle", "ca", "cr"};
281  for (size_t i = 0; i < 4; ++i) {
282  printf(" %s=%ld (%ld:%ld)", cp[i], smm[i][0], smm[i][1], smm[i][2]);
283  }
284  printf("\n");
285 #else
286  (void) bal; // Remove warning about unused
287 #endif
288 }
289 
290 int* interleave_order(int ith, int ncell, int nnode, int* parent) {
291  // return if there are no nodes to permute
292  if (nnode <= 0)
293  return nullptr;
294 
295  // ensure parent of root = -1
296  for (int i = 0; i < ncell; ++i) {
297  if (parent[i] == 0) {
298  parent[i] = -1;
299  }
300  }
301 
302  int nwarp = 0, nstride = 0, *stride = nullptr, *firstnode = nullptr;
303  int *lastnode = nullptr, *cellsize = nullptr, *stridedispl = nullptr;
304 
305  int* order = node_order(
306  ncell, nnode, parent, nwarp, nstride, stride, firstnode, lastnode, cellsize, stridedispl);
307 
308  if (interleave_info) {
310  ii.nwarp = nwarp;
311  ii.nstride = nstride;
312  ii.stridedispl = stridedispl;
313  ii.stride = stride;
314  ii.firstnode = firstnode;
315  ii.lastnode = lastnode;
316  ii.cellsize = cellsize;
317  if (0 && ith == 0 && interleave_permute_type == 1) {
318  printf("ith=%d nstride=%d ncell=%d nnode=%d\n", ith, nstride, ncell, nnode);
319  for (int i = 0; i < ncell; ++i) {
320  printf("icell=%d cellsize=%d first=%d last=%d\n",
321  i,
322  cellsize[i],
323  firstnode[i],
324  lastnode[i]);
325  }
326  for (int i = 0; i < nstride; ++i) {
327  printf("istride=%d stride=%d\n", i, stride[i]);
328  }
329  }
330  if (ith == 0) {
331  // needed for print_quality[12] and done once here to save time
332  int* p = new int[nnode];
333  for (int i = 0; i < nnode; ++i) {
334  p[i] = parent[i];
335  }
336  permute_ptr(p, nnode, order);
337  node_permute(p, nnode, order);
338 
339  ii.nnode = new size_t[nwarp];
340  ii.ncycle = new size_t[nwarp];
341  ii.idle = new size_t[nwarp];
342  ii.cache_access = new size_t[nwarp];
343  ii.child_race = new size_t[nwarp];
344  for (int i = 0; i < nwarp; ++i) {
345  if (interleave_permute_type == 1) {
347  }
348  if (interleave_permute_type == 2) {
349  print_quality2(i, interleave_info[ith], p);
350  }
351  }
352  delete[] p;
353  warp_balance(ith, interleave_info[ith]);
354  }
355  }
356 
357  return order;
358 }
359 
360 #if INTERLEAVE_DEBUG // only the cell per core style
361 static int** cell_indices_debug(NrnThread& nt, InterleaveInfo& ii) {
362  int ncell = nt.ncell;
363  int nnode = nt.end;
364  int* parents = nt._v_parent_index;
365 
366  // we expect the nodes to be interleave ordered with smallest cell first
367  // establish consistency with ii.
368  // first ncell parents are -1
369  for (int i = 0; i < ncell; ++i) {
370  nrn_assert(parents[i] == -1);
371  }
372  int* sz = new int[ncell];
373  int* cell = new int[nnode];
374  for (int i = 0; i < ncell; ++i) {
375  sz[i] = 0;
376  cell[i] = i;
377  }
378  for (int i = ncell; i < nnode; ++i) {
379  cell[i] = cell[parents[i]];
380  sz[cell[i]] += 1;
381  }
382 
383  // cells are in inceasing sz order;
384  for (int i = 1; i < ncell; ++i) {
385  nrn_assert(sz[i - 1] <= sz[i]);
386  }
387  // same as ii.cellsize
388  for (int i = 0; i < ncell; ++i) {
389  nrn_assert(sz[i] == ii.cellsize[i]);
390  }
391 
392  int** cellindices = new int*[ncell];
393  for (int i = 0; i < ncell; ++i) {
394  cellindices[i] = new int[sz[i]];
395  sz[i] = 0; // restart sz counts
396  }
397  for (int i = ncell; i < nnode; ++i) {
398  cellindices[cell[i]][sz[cell[i]]] = i;
399  sz[cell[i]] += 1;
400  }
401  // cellindices first and last same as ii first and last
402  for (int i = 0; i < ncell; ++i) {
403  nrn_assert(cellindices[i][0] == ii.firstnode[i]);
404  nrn_assert(cellindices[i][sz[i] - 1] == ii.lastnode[i]);
405  }
406 
407  delete[] sz;
408  delete[] cell;
409 
410  return cellindices;
411 }
412 
413 static int*** cell_indices_threads;
414 void mk_cell_indices() {
415  cell_indices_threads = new int**[nrn_nthread];
416  for (int i = 0; i < nrn_nthread; ++i) {
417  NrnThread& nt = nrn_threads[i];
418  if (nt.ncell) {
419  cell_indices_threads[i] = cell_indices_debug(nt, interleave_info[i]);
420  } else {
421  cell_indices_threads[i] = nullptr;
422  }
423  }
424 }
425 #endif // INTERLEAVE_DEBUG
426 
427 #define GPU_V(i) nt->_actual_v[i]
428 #define GPU_A(i) nt->_actual_a[i]
429 #define GPU_B(i) nt->_actual_b[i]
430 #define GPU_D(i) nt->_actual_d[i]
431 #define GPU_RHS(i) nt->_actual_rhs[i]
432 #define GPU_PARENT(i) nt->_v_parent_index[i]
433 
// How does the interleaved permutation with stride get used in
// triangularization?
436 
437 // each cell in parallel regardless of inhomogeneous topology
439  int icell,
440  int icellsize,
441  int nstride,
442  int* stride,
443  int* lastnode) {
444  int i = lastnode[icell];
445  for (int istride = nstride - 1; istride >= 0; --istride) {
446  if (istride < icellsize) { // only first icellsize strides matter
447  // what is the index
448  int ip = GPU_PARENT(i);
449 #ifndef CORENEURON_ENABLE_GPU
450  nrn_assert(ip >= 0); // if (ip < 0) return;
451 #endif
452  double p = GPU_A(i) / GPU_D(i);
453  GPU_D(ip) -= p * GPU_B(i);
454  GPU_RHS(ip) -= p * GPU_RHS(i);
455  i -= stride[istride];
456  }
457  }
458 }
459 
// Back substitution for one cell's tree matrix (interleave_permute_type == 1,
// one cell per execution stream). Runs after the triangularization pass and
// walks from the node just below the root towards the leaves.
static void bksub_interleaved(NrnThread* nt,
                              int icell,
                              int icellsize,
                              int /* nstride */,
                              int* stride,
                              int* firstnode) {
    int i = firstnode[icell];
    GPU_RHS(icell) /= GPU_D(icell);  // the root
    // only the first icellsize strides belong to this cell
    for (int istride = 0; istride < icellsize; ++istride) {
        int ip = GPU_PARENT(i);
#ifndef CORENEURON_ENABLE_GPU
        // on CPU the parent index must be valid; on GPU the assert is elided
        nrn_assert(ip >= 0);
#endif
        GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
        GPU_RHS(i) /= GPU_D(i);
        // advance to this cell's node in the next stride level
        // (NOTE(review): indexing starts at stride[1] — presumably stride[0]
        // covers the root level; confirm against node_order)
        i += stride[istride + 1];
    }
}
479 
480 // icore ranges [0:warpsize) ; stride[ncycle]
481 nrn_pragma_acc(routine vector)
482 static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) {
483  int icycle = ncycle - 1;
484  int istride = stride[icycle];
485  int i = lastnode - istride + icore;
486  int ii = i;
487 
488  // execute until all tree depths are executed
490 
491  // clang-format off
492  nrn_pragma_acc(loop seq)
493  for (; has_subtrees_to_compute; ) { // ncycle loop
494  // serial test, gpu does this in parallel
495  nrn_pragma_acc(loop vector)
496  nrn_pragma_omp(loop bind(parallel))
497  for (int icore = 0; icore < warpsize; ++icore) {
498  int i = ii + icore;
499  if (icore < istride) { // most efficient if istride equal warpsize
500  // what is the index
501  int ip = GPU_PARENT(i);
502  double p = GPU_A(i) / GPU_D(i);
503  nrn_pragma_acc(atomic update)
504  nrn_pragma_omp(atomic update)
505  GPU_D(ip) -= p * GPU_B(i);
506  nrn_pragma_acc(atomic update)
507  nrn_pragma_omp(atomic update)
508  GPU_RHS(ip) -= p * GPU_RHS(i);
509  }
510  }
511  // if finished with all tree depths then ready to break
512  // (note that break is not allowed in OpenACC)
513  if (icycle == 0) {
514  has_subtrees_to_compute = false;
515  continue;
516  }
517  --icycle;
518  istride = stride[icycle];
519  i -= istride;
520  ii -= istride;
521  }
522 }
523 
// icore ranges [0:warpsize) ; stride[ncycle]
// Back substitution for interleave_permute_type == 2: one warp handles a
// group of cells at compartment granularity. [root, lastroot) are this
// warp's root nodes; firstnode is the first non-root node of the warp.
nrn_pragma_acc(routine vector)
static void bksub_interleaved2(NrnThread* nt,
                               int root,
                               int lastroot,
                               int icore,
                               int ncycle,
                               int* stride,
                               int firstnode) {
    // divide all roots of this warp first
    nrn_pragma_acc(loop seq)
    for (int i = root; i < lastroot; i += 1) {
        GPU_RHS(i) /= GPU_D(i);  // the root
    }

    int i = firstnode + icore;
    int ii = i;
    // walk the tree levels (cycles) from shallow to deep
    nrn_pragma_acc(loop seq)
    for (int icycle = 0; icycle < ncycle; ++icycle) {
        int istride = stride[icycle];
        // serial test, gpu does this in parallel
        nrn_pragma_acc(loop vector)
        nrn_pragma_omp(loop bind(parallel))
        for (int icore = 0; icore < warpsize; ++icore) {
            int i = ii + icore;  // shadows the outer i: this core's node
            if (icore < istride) {
                int ip = GPU_PARENT(i);
                GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
                GPU_RHS(i) /= GPU_D(i);
            }
            // NOTE(review): dead store — this inner i is re-initialized each
            // iteration; the real level advance is ii += istride below
            i += istride;
        }
        ii += istride;
    }
}
558 
/**
 * \brief Solve Hines matrices/cells with compartment-based granularity.
 *
 * The node ordering/permutation guarantees cell interleaving (as much coalesced memory access as
 * possible) and balanced warps (through the use of the lpt algorithm to define the groups/warps).
 * Every warp deals with a group of cells, therefore multiple compartments (finer level of
 * parallelism).
 */
566 void solve_interleaved2(int ith) {
567  NrnThread* nt = nrn_threads + ith;
568  InterleaveInfo& ii = interleave_info[ith];
569  int nwarp = ii.nwarp;
570  if (nwarp == 0)
571  return;
572 
573  int ncore = nwarp * warpsize;
574 
575 #ifdef _OPENACC
577  auto* d_nt = static_cast<NrnThread*>(acc_deviceptr(nt));
578  auto* d_info = static_cast<InterleaveInfo*>(acc_deviceptr(interleave_info + ith));
579  solve_interleaved2_launcher(d_nt, d_info, ncore, acc_get_cuda_stream(nt->stream_id));
580  } else {
581 #endif
582  int* ncycles = ii.cellsize; // nwarp of these
583  int* stridedispl = ii.stridedispl; // nwarp+1 of these
584  int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize)
585  int* rootbegin = ii.firstnode; // nwarp+1 of these
586  int* nodebegin = ii.lastnode; // nwarp+1 of these
587 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
588  defined(_OPENACC)
589  int nstride = stridedispl[nwarp];
590 #endif
591  /* If we compare this loop with the one from cellorder.cu (CUDA version), we will understand
592  * that the parallelism here is exposed in steps, while in the CUDA version all the parallelism
593  * is exposed from the very beginning of the loop. In more details, here we initially distribute
594  * the outermost loop, e.g. in the CUDA blocks, and for the innermost loops we explicitly use multiple
595  * threads for the parallelization (see for example the loop directives in triang/bksub_interleaved2).
596  * On the other hand, in the CUDA version the outermost loop is distributed to all the available threads,
597  * and therefore there is no need to have the innermost loops. Here, the loop/icore jumps every warpsize,
598  * while in the CUDA version the icore increases by one. Other than this, the two loop versions
599  * are equivalent (same results).
600  */
601  nrn_pragma_acc(parallel loop gang present(nt [0:1],
602  strides [0:nstride],
603  ncycles [0:nwarp],
604  stridedispl [0:nwarp + 1],
605  rootbegin [0:nwarp + 1],
606  nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
607  nrn_pragma_omp(target teams loop if(nt->compute_gpu))
608  for (int icore = 0; icore < ncore; icore += warpsize) {
609  int iwarp = icore / warpsize; // figure out the >> value
610  int ic = icore & (warpsize - 1); // figure out the & mask
611  int ncycle = ncycles[iwarp];
612  int* stride = strides + stridedispl[iwarp];
613  int root = rootbegin[iwarp]; // cell ID -> [0, ncell)
614  int lastroot = rootbegin[iwarp + 1];
615  int firstnode = nodebegin[iwarp];
616  int lastnode = nodebegin[iwarp + 1];
617 
618  triang_interleaved2(nt, ic, ncycle, stride, lastnode);
619  bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
620  }
621  nrn_pragma_acc(wait(nt->stream_id))
622 #ifdef _OPENACC
623  }
624 #endif
625 }
626 
627 /**
628  * \brief Solve Hines matrices/cells with cell-based granularity.
629  *
630  * The node ordering guarantees cell interleaving (as much coalesced memory access as possible),
631  * but parallelism granularity is limited to a per cell basis. Therefore every execution stream
632  * is mapped to a cell/tree.
633  */
void solve_interleaved1(int ith) {
    // one Hines solve per cell; each cell/tree maps to one execution stream
    NrnThread* nt = nrn_threads + ith;
    int ncell = nt->ncell;
    if (ncell == 0) {
        return;  // nothing to solve on this thread
    }
    InterleaveInfo& ii = interleave_info[ith];
    int nstride = ii.nstride;
    int* stride = ii.stride;
    int* firstnode = ii.firstnode;
    int* lastnode = ii.lastnode;
    int* cellsize = ii.cellsize;

    // OL211123: can we preserve the error checking behaviour of OpenACC's
    // present clause with OpenMP? It is a bug if these data are not present,
    // so diagnostics are helpful...
    nrn_pragma_acc(parallel loop present(nt [0:1],
                                         stride [0:nstride],
                                         firstnode [0:ncell],
                                         lastnode [0:ncell],
                                         cellsize [0:ncell]) if (nt->compute_gpu)
                       async(nt->stream_id))
    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
    for (int icell = 0; icell < ncell; ++icell) {
        int icellsize = cellsize[icell];
        // eliminate towards the root, then substitute back towards the leaves
        triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
        bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
    }
    // make sure the async device work for this stream has completed
    nrn_pragma_acc(wait(nt->stream_id))
}
664 
665 void solve_interleaved(int ith) {
666  if (interleave_permute_type != 1) {
667  solve_interleaved2(ith);
668  } else {
669  solve_interleaved1(ith);
670  }
671 }
672 } // namespace coreneuron
coreneuron::InterleaveInfo::nnode
size_t * nnode
Definition: cellorder.hpp:65
coreneuron::interleave_order
int * interleave_order(int ith, int ncell, int nnode, int *parent)
Function that performs the permutation of the cells such that the execution threads access coalesced ...
Definition: cellorder.cpp:290
coreneuron::InterleaveInfo::InterleaveInfo
InterleaveInfo()=default
free_memory
void free_memory(void *pointer)
Definition: memory.h:196
coreneuron::nrn_nthread
int nrn_nthread
Definition: multicore.cpp:55
coreneuron::InterleaveInfo::nstride
int nstride
Definition: cellorder.hpp:57
coreneuron::node_order
int * node_order(int ncell, int nnode, int *parents, int &nwarp, int &nstride, int *&stride, int *&firstnode, int *&lastnode, int *&cellsize, int *&stridedispl)
Function that returns a permutation of length nnode.
Definition: cellorder1.cpp:300
coreneuron::lastnode
int int int int lastnode
Definition: cellorder.cpp:482
coreneuron::triang_interleaved
static void triang_interleaved(NrnThread *nt, int icell, int icellsize, int nstride, int *stride, int *lastnode)
Definition: cellorder.cpp:438
GPU_B
#define GPU_B(i)
Definition: cellorder.cpp:429
coreneuron::copy_align_array
void copy_align_array(T *&dest, T *src, size_t n)
Definition: cellorder.hpp:118
warpsize
#define warpsize
Definition: tnode.hpp:85
lpt.hpp
coreneuron::InterleaveInfo::~InterleaveInfo
~InterleaveInfo()
Definition: cellorder.cpp:77
coreneuron::istride
int istride
Definition: cellorder.cpp:484
nrn_pragma_omp
nrn_pragma_acc(routine seq) nrn_pragma_omp(declare target) philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron nrn_pragma_omp(end declare target) namespace coreneuron
Provide a helper function in global namespace that is declared target for OpenMP offloading to functi...
Definition: nrnran123.h:69
coreneuron::lastroot
int int lastroot
Definition: cellorder.cpp:528
coreneuron::InterleaveInfo::swap
void swap(InterleaveInfo &info)
Definition: cellorder.cpp:32
coreneuron::ii
int ii
Definition: cellorder.cpp:486
coreneuron::corenrn_parameters_data::gpu
bool gpu
Enable pthread/openmp.
Definition: corenrn_parameters.hpp:63
coreneuron::interleave_permute_type
int interleave_permute_type
Definition: cellorder.cpp:28
coreneuron::InterleaveInfo::lastnode
int * lastnode
Definition: cellorder.hpp:61
coreneuron::InterleaveInfo::nwarp
int nwarp
Definition: cellorder.hpp:56
coreneuron
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
Definition: corenrn_parameters.cpp:12
GPU_PARENT
#define GPU_PARENT(i)
Definition: cellorder.cpp:432
coreneuron::ncell
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;}}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) if(nt->compute_gpu) async(nt->stream_id)) nrn_pragma_omp(target teams loop if(nt->compute_gpu)) for(int icore=0;icore< ncore;icore+=warpsize) { int iwarp=icore/warpsize;int ic=icore &(warpsize - 1);int ncycle=ncycles[iwarp];int *stride=strides+stridedispl[iwarp];int root=rootbegin[iwarp];int lastroot=rootbegin[iwarp+1];int firstnode=nodebegin[iwarp];int lastnode=nodebegin[iwarp+1];triang_interleaved2(nt, ic, ncycle, stride, lastnode);bksub_interleaved2(nt, root+ic, lastroot, ic, ncycle, stride, firstnode);} nrn_pragma_acc(wait(nt->stream_id))}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
Definition: cellorder.cpp:636
corenrn_parameters.hpp
coreneuron::i
int i
Definition: cellorder.cpp:485
coreneuron::InterleaveInfo::stridedispl
int * stridedispl
Definition: cellorder.hpp:58
coreneuron::solve_interleaved2_launcher
void solve_interleaved2_launcher(NrnThread *nt, InterleaveInfo *info, int ncore, void *stream)
CUDA branch of the solve_interleaved with interleave_permute_type == 2.
coreneuron::InterleaveInfo::cellsize
int * cellsize
Definition: cellorder.hpp:62
coreneuron::update
void update(NrnThread *_nt)
Definition: fadvance_core.cpp:201
coreneuron::destroy_interleave_info
void destroy_interleave_info()
Definition: cellorder.cpp:101
coreneuron::InterleaveInfo::firstnode
int * firstnode
Definition: cellorder.hpp:60
coreneuron::InterleaveInfo::cache_access
size_t * cache_access
Definition: cellorder.hpp:68
coreneuron::InterleaveInfo::idle
size_t * idle
Definition: cellorder.hpp:67
coreneuron::InterleaveInfo::operator=
InterleaveInfo & operator=(const InterleaveInfo &)
Definition: cellorder.cpp:66
coreneuron::ncycle
int int ncycle
Definition: cellorder.cpp:482
tnode.hpp
node_permute.h
coreneuron::NrnThread
Definition: multicore.hpp:75
coreneuron::warp_balance
size_t warp_balance(size_t ncell, VecTNode &nodevec)
Use of the LPT (Least Processing Time) algorithm to create balanced groups of cells.
Definition: balance.cpp:43
coreneuron::permute_ptr
void permute_ptr(int *vec, int n, int *p)
Definition: node_permute.cpp:345
coreneuron::bksub_interleaved
static void bksub_interleaved(NrnThread *nt, int icell, int icellsize, int, int *stride, int *firstnode)
Definition: cellorder.cpp:461
coreneuron::interleave_info
InterleaveInfo * interleave_info
Definition: cellorder.cpp:29
coreneuron::corenrn_parameters_data::cuda_interface
bool cuda_interface
Enable GPU computation.
Definition: corenrn_parameters.hpp:64
coreneuron::node_permute
void node_permute(int *vec, int n, int *permute)
Definition: node_permute.cpp:337
coreneuron::corenrn_param
corenrn_parameters corenrn_param
Printing method.
Definition: corenrn_parameters.cpp:268
coreneuron::InterleaveInfo::stride
int * stride
Definition: cellorder.hpp:59
coreneuron::solve_interleaved
void solve_interleaved(int ith)
Solve the Hines matrices based on the interleave_permute_type (1 or 2).
coreneuron::nrn_threads
NrnThread * nrn_threads
Definition: multicore.cpp:56
GPU_A
#define GPU_A(i)
Definition: cellorder.cpp:428
GPU_RHS
#define GPU_RHS(i)
Definition: cellorder.cpp:431
nrnconf.h
cellorder.hpp
coreneuron::print_quality2
static void print_quality2(int iwarp, InterleaveInfo &ii, int *p)
Definition: cellorder.cpp:110
coreneuron::root
int root
Definition: cellorder.cpp:527
coreneuron::copy_array
void copy_array(T *&dest, T *src, size_t n)
Definition: cellorder.hpp:111
multicore.hpp
coreneuron::create_interleave_info
void create_interleave_info()
Definition: cellorder.cpp:96
coreneuron::nstride
int nstride
Definition: cellorder.cpp:641
coreneuron::InterleaveInfo::ncycle
size_t * ncycle
Definition: cellorder.hpp:66
v
#define v
Definition: md1redef.h:11
coreneuron::stride
int int int * stride
Definition: cellorder.cpp:482
coreneuron::InterleaveInfo
Definition: cellorder.hpp:50
coreneuron::print_quality1
static void print_quality1(int iwarp, InterleaveInfo &ii, int ncell, int *p)
Definition: cellorder.cpp:175
coreneuron::icore
int icore
Definition: cellorder.cpp:482
offload.hpp
coreneuron::nrn_pragma_acc
nrn_pragma_acc(routine vector) static void triang_interleaved2(NrnThread *nt
Definition: ivocvect.cpp:30
coreneuron::if
if(ncell==0)
Definition: cellorder.cpp:637
GPU_D
#define GPU_D(i)
Definition: cellorder.cpp:430
coreneuron::firstnode
int int int int int int firstnode
Definition: cellorder.cpp:532
nrn_assert
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
Definition: nrn_assert.h:33
coreneuron::cellsize
int * cellsize
Definition: cellorder.cpp:645
coreneuron::InterleaveInfo::child_race
size_t * child_race
Definition: cellorder.hpp:69
coreneuron::has_subtrees_to_compute
bool has_subtrees_to_compute
Definition: cellorder.cpp:489
nrn_assert.h
memory.h
coreneuron::NrnThread::ncell
int ncell
Definition: multicore.hpp:97
load_balance
double load_balance(std::vector< size_t > &v)
Definition: lpt.cpp:69