Go to the documentation of this file.
111 int pc = (iwarp == 0);
113 int nodebegin =
ii.lastnode[iwarp];
114 int*
stride =
ii.stride +
ii.stridedispl[iwarp];
117 int inode = nodebegin;
121 size_t ncacheline = 0;
126 for (
int icycle = 0; icycle <
ncycle; ++icycle) {
136 if (crace.find(par) != crace.end()) {
143 if (par != lastp + 1) {
144 ch = (ch ==
'r') ?
'R' :
'o';
160 ii.nnode[iwarp] = nn;
163 ii.cache_access[iwarp] = ncacheline;
164 ii.child_race[iwarp] = ncr;
166 printf(
"warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access, %ld child races\n",
176 int pc = ((iwarp == 0) || iwarp == (
ii.nwarp - 1));
184 for (
int i = cellbegin;
i < cellend; ++
i) {
192 int ncell_in_warp = cellend - cellbegin;
196 size_t ncacheline = 0;
201 int inode =
ii.firstnode[cellbegin];
202 for (
int icycle = 0; icycle <
ncycle; ++icycle) {
209 if (icore < ncell_in_warp && icore >= sbegin) {
210 int par = p[inode +
icore];
211 if (par != lastp + 1) {
226 inode +=
ii.stride[icycle + 1];
232 ii.cache_access[iwarp] = ncacheline;
233 ii.child_race[iwarp] = 0;
235 printf(
"warp %d: %ld nodes, %d cycles, %ld idle, %ld cache access\n",
244 size_t nwarp = size_t(
ii.nwarp);
246 for (
size_t j = 0; j < 4; ++j) {
248 smm[j][1] = 1000000000;
251 double emax = 0.0, emin = 1.0;
252 for (
size_t i = 0;
i < nwarp; ++
i) {
253 size_t n =
ii.nnode[
i];
254 double e = double(n) / (n +
ii.idle[
i]);
261 size_t s[4] = {n,
ii.idle[
i],
ii.cache_access[
i],
ii.child_race[
i]};
262 for (
size_t j = 0; j < 4; ++j) {
264 if (smm[j][1] > s[j]) {
267 if (smm[j][2] < s[j]) {
272 std::vector<size_t>
v(nwarp);
273 for (
size_t i = 0;
i < nwarp; ++
i) {
279 "thread %d nwarp=%ld balance=%g warp_efficiency %g to %g\n", ith, nwarp, bal, emin, emax);
280 const char* cp[4] = {
"nodes",
"idle",
"ca",
"cr"};
281 for (
size_t i = 0;
i < 4; ++
i) {
282 printf(
" %s=%ld (%ld:%ld)", cp[
i], smm[
i][0], smm[
i][1], smm[
i][2]);
297 if (parent[
i] == 0) {
312 ii.stridedispl = stridedispl;
318 printf(
"ith=%d nstride=%d ncell=%d nnode=%d\n", ith,
nstride,
ncell, nnode);
320 printf(
"icell=%d cellsize=%d first=%d last=%d\n",
327 printf(
"istride=%d stride=%d\n",
i,
stride[
i]);
332 int* p =
new int[nnode];
333 for (
int i = 0;
i < nnode; ++
i) {
339 ii.nnode =
new size_t[nwarp];
340 ii.ncycle =
new size_t[nwarp];
341 ii.idle =
new size_t[nwarp];
342 ii.cache_access =
new size_t[nwarp];
343 ii.child_race =
new size_t[nwarp];
344 for (
int i = 0;
i < nwarp; ++
i) {
360 #if INTERLEAVE_DEBUG // only the cell per core style
361 static int** cell_indices_debug(NrnThread& nt, InterleaveInfo&
ii) {
364 int* parents = nt._v_parent_index;
372 int* sz =
new int[
ncell];
373 int* cell =
new int[nnode];
378 for (
int i =
ncell;
i < nnode; ++
i) {
379 cell[
i] = cell[parents[
i]];
392 int** cellindices =
new int*[
ncell];
394 cellindices[
i] =
new int[sz[
i]];
397 for (
int i =
ncell;
i < nnode; ++
i) {
398 cellindices[cell[
i]][sz[cell[
i]]] =
i;
413 static int*** cell_indices_threads;
414 void mk_cell_indices() {
421 cell_indices_threads[
i] =
nullptr;
425 #endif // INTERLEAVE_DEBUG
427 #define GPU_V(i) nt->_actual_v[i]
428 #define GPU_A(i) nt->_actual_a[i]
429 #define GPU_B(i) nt->_actual_b[i]
430 #define GPU_D(i) nt->_actual_d[i]
431 #define GPU_RHS(i) nt->_actual_rhs[i]
432 #define GPU_PARENT(i) nt->_v_parent_index[i]
449 #ifndef CORENEURON_ENABLE_GPU
471 #ifndef CORENEURON_ENABLE_GPU
526 static void bksub_interleaved2(NrnThread* nt,
541 for (
int icycle = 0; icycle <
ncycle; ++icycle) {
566 void solve_interleaved2(
int ith) {
569 int nwarp =
ii.nwarp;
577 auto* d_nt =
static_cast<NrnThread*
>(acc_deviceptr(nt));
578 auto* d_info =
static_cast<InterleaveInfo*
>(acc_deviceptr(
interleave_info + ith));
582 int* ncycles =
ii.cellsize;
583 int* stridedispl =
ii.stridedispl;
584 int* strides =
ii.stride;
585 int* rootbegin =
ii.firstnode;
586 int* nodebegin =
ii.lastnode;
587 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
589 int nstride = stridedispl[nwarp];
604 stridedispl [0:nwarp + 1],
605 rootbegin [0:nwarp + 1],
606 nodebegin [0:nwarp + 1])
if (nt->compute_gpu) async(nt->stream_id))
611 int ncycle = ncycles[iwarp];
612 int*
stride = strides + stridedispl[iwarp];
613 int root = rootbegin[iwarp];
614 int lastroot = rootbegin[iwarp + 1];
616 int lastnode = nodebegin[iwarp + 1];
634 void solve_interleaved1(
int ith) {
655 async(nt->stream_id))
656 nrn_pragma_omp(target teams distribute parallel for simd
if(nt->compute_gpu))
657 for (
int icell = 0; icell <
ncell; ++icell) {
667 solve_interleaved2(ith);
669 solve_interleaved1(ith);
int * interleave_order(int ith, int ncell, int nnode, int *parent)
Function that performs the permutation of the cells such that the execution threads access coalesced ...
void free_memory(void *pointer)
int * node_order(int ncell, int nnode, int *parents, int &nwarp, int &nstride, int *&stride, int *&firstnode, int *&lastnode, int *&cellsize, int *&stridedispl)
Function that returns a permutation of length nnode.
static void triang_interleaved(NrnThread *nt, int icell, int icellsize, int nstride, int *stride, int *lastnode)
void copy_align_array(T *&dest, T *src, size_t n)
nrn_pragma_acc(routine seq) nrn_pragma_omp(declare target) philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron nrn_pragma_omp(end declare target) namespace coreneuron
Provide a helper function in global namespace that is declared target for OpenMP offloading to functi...
void swap(InterleaveInfo &info)
bool gpu
Enable pthread/openmp.
int interleave_permute_type
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
icycle< ncycle;++icycle) { int istride=stride[icycle];nrn_pragma_acc(loop vector) nrn_pragma_omp(loop bind(parallel)) for(int icore=0;icore< warpsize;++icore) { int i=ii+icore;if(icore< istride) { int ip=GPU_PARENT(i);GPU_RHS(i) -=GPU_B(i) *GPU_RHS(ip);GPU_RHS(i)/=GPU_D(i);} i+=istride;} ii+=istride;}}void solve_interleaved2(int ith) { NrnThread *nt=nrn_threads+ith;InterleaveInfo &ii=interleave_info[ith];int nwarp=ii.nwarp;if(nwarp==0) return;int ncore=nwarp *warpsize;int *ncycles=ii.cellsize;int *stridedispl=ii.stridedispl;int *strides=ii.stride;int *rootbegin=ii.firstnode;int *nodebegin=ii.lastnode;nrn_pragma_acc(parallel loop gang present(nt[0:1], strides[0:nstride], ncycles[0:nwarp], stridedispl[0:nwarp+1], rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) if(nt->compute_gpu) async(nt->stream_id)) nrn_pragma_omp(target teams loop if(nt->compute_gpu)) for(int icore=0;icore< ncore;icore+=warpsize) { int iwarp=icore/warpsize;int ic=icore &(warpsize - 1);int ncycle=ncycles[iwarp];int *stride=strides+stridedispl[iwarp];int root=rootbegin[iwarp];int lastroot=rootbegin[iwarp+1];int firstnode=nodebegin[iwarp];int lastnode=nodebegin[iwarp+1];triang_interleaved2(nt, ic, ncycle, stride, lastnode);bksub_interleaved2(nt, root+ic, lastroot, ic, ncycle, stride, firstnode);} nrn_pragma_acc(wait(nt->stream_id))}void solve_interleaved1(int ith) { NrnThread *nt=nrn_threads+ith;int ncell=nt-> ncell
void solve_interleaved2_launcher(NrnThread *nt, InterleaveInfo *info, int ncore, void *stream)
CUDA branch of the solve_interleaved with interleave_permute_type == 2.
void update(NrnThread *_nt)
void destroy_interleave_info()
InterleaveInfo & operator=(const InterleaveInfo &)
size_t warp_balance(size_t ncell, VecTNode &nodevec)
Use of the LPT (Least Processing Time) algorithm to create balanced groups of cells.
void permute_ptr(int *vec, int n, int *p)
static void bksub_interleaved(NrnThread *nt, int icell, int icellsize, int, int *stride, int *firstnode)
InterleaveInfo * interleave_info
bool cuda_interface
Enable GPU computation.
void node_permute(int *vec, int n, int *permute)
corenrn_parameters corenrn_param
Printing method.
void solve_interleaved(int ith)
Solve the Hines matrices based on the interleave_permute_type (1 or 2).
static void print_quality2(int iwarp, InterleaveInfo &ii, int *p)
void copy_array(T *&dest, T *src, size_t n)
void create_interleave_info()
static void print_quality1(int iwarp, InterleaveInfo &ii, int ncell, int *p)
nrn_pragma_acc(routine vector) static void triang_interleaved2(NrnThread *nt
int int int int int int firstnode
#define nrn_assert(x)
assert()-like macro, independent of NDEBUG status
bool has_subtrees_to_compute
double load_balance(std::vector< size_t > &v)