CoreNEURON
nrn_acc_manager.cpp
1 /*
2 # =============================================================================
3 # Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================
7 */
8 
9 #include <queue>
10 #include <utility>
11 
26 
27 #ifdef CRAYPAT
28 #include <pat_api.h>
29 #endif
30 
31 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
32 #include <cuda_runtime_api.h>
33 #endif
34 
35 #if __has_include(<cxxabi.h>)
36 #define USE_CXXABI
37 #include <cxxabi.h>
38 #include <memory>
39 #include <string>
40 #endif
41 
42 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
43 #include <cassert>
44 #include <cstddef>
45 #include <iostream>
46 #include <map>
47 #include <shared_mutex>
48 namespace {
49 struct present_table_value {
50  std::size_t ref_count{}, size{};
51  std::byte* dev_ptr{};
52 };
53 std::map<std::byte const*, present_table_value> present_table;
54 std::shared_mutex present_table_mutex;
55 } // namespace
56 #endif
57 
58 namespace {
59 /** @brief Try to demangle a type name, return the mangled name on failure.
60  */
61 std::string cxx_demangle(const char* mangled) {
62 #ifdef USE_CXXABI
63  int status{};
64  // Note that the third argument to abi::__cxa_demangle returns the length of
65  // the allocated buffer, which may be larger than strlen(demangled) + 1.
66  std::unique_ptr<char, decltype(free)*> demangled{
67  abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
68  return status ? mangled : demangled.get();
69 #else
70  return mangled;
71 #endif
72 }
73 bool cnrn_target_debug_output_enabled() {
74  const char* env = std::getenv("CORENEURON_GPU_DEBUG");
75  if (!env) {
76  return false;
77  }
78  std::string env_s{env};
79  if (env_s == "1") {
80  return true;
81  } else if (env_s == "0") {
82  return false;
83  } else {
84  throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s + ")");
85  }
86 }
87 bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
88 } // namespace
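[Editor's note, not part of the original listing] The helpers above are only active when the environment variable CORENEURON_GPU_DEBUG is set to 1 (any other value than 0 or 1 throws), and the flag is read once, at static initialisation, into cnrn_target_enable_debug. A minimal, hypothetical sketch of the demangling helper in isolation, assuming it is visible from the calling code:

// Hypothetical, self-contained illustration; not part of nrn_acc_manager.cpp.
#include <iostream>
#include <typeinfo>

void demangle_demo() {
    // With GCC/Clang, typeid(int).name() yields the mangled string "i";
    // cxx_demangle() returns "int", or the original string if demangling fails.
    std::cout << cxx_demangle(typeid(int).name()) << '\n';
}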
89 
90 namespace coreneuron {
91 extern InterleaveInfo* interleave_info;
94 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
95 void nrn_VecPlay_delete_from_device(NrnThread* nt);
96 
97 void cnrn_target_copyin_debug(std::string_view file,
98  int line,
99  std::size_t sizeof_T,
100  std::type_info const& typeid_T,
101  void const* h_ptr,
102  std::size_t len,
103  void* d_ptr) {
104  if (!cnrn_target_enable_debug) {
105  return;
106  }
107  std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name())
108  << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T
109  << ") -> " << d_ptr << std::endl;
110 }
111 void cnrn_target_delete_debug(std::string_view file,
112  int line,
113  std::size_t sizeof_T,
114  std::type_info const& typeid_T,
115  void const* h_ptr,
116  std::size_t len) {
117  if (!cnrn_target_enable_debug) {
118  return;
119  }
120  std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name())
121  << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')'
122  << std::endl;
123 }
124 void cnrn_target_deviceptr_debug(std::string_view file,
125  int line,
126  std::type_info const& typeid_T,
127  void const* h_ptr,
128  void* d_ptr) {
129  if (!cnrn_target_enable_debug) {
130  return;
131  }
132  std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
133  << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
134 }
135 void cnrn_target_is_present_debug(std::string_view file,
136  int line,
137  std::type_info const& typeid_T,
138  void const* h_ptr,
139  void* d_ptr) {
140  if (!cnrn_target_enable_debug) {
141  return;
142  }
143  std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name())
144  << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
145 }
146 void cnrn_target_memcpy_to_device_debug(std::string_view file,
147  int line,
148  std::size_t sizeof_T,
149  std::type_info const& typeid_T,
150  void const* h_ptr,
151  std::size_t len,
152  void* d_ptr) {
153  if (!cnrn_target_enable_debug) {
154  return;
155  }
156  std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<"
157  << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len
158  << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl;
159 }
160 
161 #ifdef CORENEURON_ENABLE_PRESENT_TABLE
162 std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
163  if (!h_ptr) {
164  return {nullptr, false};
165  }
166  // Concurrent calls to this method are safe, but they must be serialised
167  // w.r.t. calls to the cnrn_target_*_update_present_table methods.
168  std::shared_lock _{present_table_mutex};
169  if (present_table.empty()) {
170  return {nullptr, must_be_present_or_null};
171  }
172  // prev(first iterator greater than h_ptr or last if not found) gives the first iterator less
173  // than or equal to h_ptr
174  auto const iter = std::prev(std::upper_bound(
175  present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
176  return hp < entry.first;
177  }));
178  if (iter == present_table.end()) {
179  return {nullptr, must_be_present_or_null};
180  }
181  std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
182  std::byte const* const h_start_of_block{iter->first};
183  std::size_t const block_size{iter->second.size};
184  std::byte* const d_start_of_block{iter->second.dev_ptr};
185  bool const is_present{h_byte_ptr < h_start_of_block + block_size};
186  if (!is_present) {
187  return {nullptr, must_be_present_or_null};
188  }
189  return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
190 }
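[Editor's sketch, not part of the file] The lookup above resolves host pointers that land anywhere inside a registered block, not only at its start: it finds the last table entry whose key is less than or equal to h_ptr and then checks that the pointer falls inside that entry's registered size. The same idea in isolation, on a toy std::map with hypothetical names:

// Toy version of the present-table lookup: given a host pointer that may point
// anywhere inside a registered [start, start + size) block, return the device
// pointer at the same offset, or nullptr if the address is not covered.
#include <cstddef>
#include <map>
#include <utility>

std::byte* toy_lookup(const std::map<const std::byte*, std::pair<std::size_t, std::byte*>>& table,
                      const std::byte* h_ptr) {
    if (table.empty()) {
        return nullptr;
    }
    // First entry with key > h_ptr ...
    auto it = table.upper_bound(h_ptr);
    if (it == table.begin()) {
        return nullptr;  // h_ptr is below the lowest registered block
    }
    --it;  // ... so the previous entry is the last one with key <= h_ptr
    auto const [size, dev_base] = it->second;
    if (h_ptr >= it->first + size) {
        return nullptr;  // past the end of that block
    }
    return dev_base + (h_ptr - it->first);
}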
191 
192 void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
193  if (!h_ptr) {
194  assert(!d_ptr);
195  return;
196  }
197  std::lock_guard _{present_table_mutex};
198  // TODO include more pedantic overlap checking?
199  present_table_value new_val{};
200  new_val.size = len;
201  new_val.ref_count = 1;
202  new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
203  auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
204  std::move(new_val));
205  if (!inserted) {
206  // Insertion didn't occur because h_ptr was already in the present table
207  assert(iter->second.size == len);
208  assert(iter->second.dev_ptr == new_val.dev_ptr);
209  ++(iter->second.ref_count);
210  }
211 }
212 void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
213  if (!h_ptr) {
214  return;
215  }
216  std::lock_guard _{present_table_mutex};
217  auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
218  assert(iter != present_table.end());
219  assert(iter->second.size == len);
220  --(iter->second.ref_count);
221  if (iter->second.ref_count == 0) {
222  present_table.erase(iter);
223  }
224 }
225 #endif
226 
227 int cnrn_target_get_num_devices() {
228 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
229  defined(_OPENACC)
230  // choose nvidia GPU by default
231  acc_device_t device_type = acc_device_nvidia;
232  // check how many gpu devices available per node
233  return acc_get_num_devices(device_type);
234 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
235  defined(_OPENMP)
236  return omp_get_num_devices();
237 #else
238  throw std::runtime_error(
239  "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build");
240 #endif
241 }
242 
243 void cnrn_target_set_default_device(int device_num) {
244 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
245  defined(_OPENACC)
246  acc_set_device_num(device_num, acc_device_nvidia);
247 #elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
248  defined(_OPENMP)
249  omp_set_default_device(device_num);
250  // It seems that with NVHPC 21.9, setting only the default OpenMP device
251  // is not enough: there were errors on some nodes when a GPU other than the
252  // 0th one was used. These seemed to be related to the NMODL instance structs, which are
253  // allocated using cudaMallocManaged.
254  auto const cuda_code = cudaSetDevice(device_num);
255  assert(cuda_code == cudaSuccess);
256 #else
257  throw std::runtime_error(
258  "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build");
259 #endif
260 }
261 
262 #ifdef CORENEURON_ENABLE_GPU
263 #ifndef CORENEURON_UNIFIED_MEMORY
264 static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
265  // As we never run code for artificial cells on the GPU, we don't copy them.
266  int is_art = corenrn.get_is_artificial()[type];
267  if (is_art) {
268  return nullptr;
269  }
270 
271  auto d_ml = cnrn_target_copyin(ml);
272 
273  if (ml->global_variables) {
274  assert(ml->global_variables_size);
275  void* d_inst = cnrn_target_copyin(static_cast<std::byte*>(ml->global_variables),
276  ml->global_variables_size);
277  cnrn_target_memcpy_to_device(&(d_ml->global_variables), &d_inst);
278  }
279 
280 
281  int n = ml->nodecount;
282  int szp = corenrn.get_prop_param_size()[type];
283  int szdp = corenrn.get_prop_dparam_size()[type];
284 
285  double* dptr = cnrn_target_deviceptr(ml->data);
286  cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr));
287 
288 
289  int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n);
290  cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices);
291 
292  if (szdp) {
293  int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
294  int* d_pdata = cnrn_target_copyin(ml->pdata, pcnt);
295  cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata);
296  }
297 
298  int ts = corenrn.get_memb_funcs()[type].thread_size_;
299  if (ts) {
300  ThreadDatum* td = cnrn_target_copyin(ml->_thread, ts);
301  cnrn_target_memcpy_to_device(&(d_ml->_thread), &td);
302  }
303 
304  // net_receive buffer associated with mechanism
305  NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
306 
307  // if a net receive buffer exists for this mechanism
308  if (nrb) {
309  NetReceiveBuffer_t* d_nrb = cnrn_target_copyin(nrb);
310  cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb);
311 
312  int* d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size);
313  cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);
314 
315  int* d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size);
316  cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);
317 
318  double* d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size);
319  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);
320 
321  double* d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size);
322  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);
323 
324  int* d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1);
325  cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);
326 
327  int* d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size);
328  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
329  }
330 
331  /* copy NetSendBuffer_t on to GPU */
332  NetSendBuffer_t* nsb = ml->_net_send_buffer;
333 
334  if (nsb) {
335  NetSendBuffer_t* d_nsb;
336  int* d_iptr;
337  double* d_dptr;
338 
339  d_nsb = cnrn_target_copyin(nsb);
340  cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb);
341 
342  d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size);
343  cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr);
344 
345  d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size);
346  cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr);
347 
348  d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size);
349  cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr);
350 
351  d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size);
352  cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr);
353 
354  d_dptr = cnrn_target_copyin(nsb->_nsb_t, nsb->_size);
355  cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr);
356 
357  d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size);
358  cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr);
359  }
360 
361  return d_ml;
362 }
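[Editor's illustration, not part of the file] copy_ml_to_device() repeatedly applies one idiom: copy a struct to the device, deep-copy each array it points to, then patch the device-side struct member so it refers to the device copy instead of the host address. Stripped of the Memb_list details, the pattern looks roughly like this (the Buf type and field names are hypothetical):

// Hypothetical type used only for this illustration.
struct Buf {
    double* data;
    int n;
};

// Sketch of the copyin + pointer fix-up idiom used throughout this file.
Buf* copy_buf_to_device(const Buf* h_buf) {
    Buf* d_buf = cnrn_target_copyin(h_buf);                       // shallow copy of the struct
    double* d_data = cnrn_target_copyin(h_buf->data, h_buf->n);   // deep copy of the array
    // Make the device-side struct point at the device-side array instead of the
    // host address it currently holds (which is meaningless on the GPU).
    cnrn_target_memcpy_to_device(&(d_buf->data), &d_data);
    return d_buf;
}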
363 #endif
364 
365 static void update_ml_on_host(const Memb_list* ml, int type) {
366  int is_art = corenrn.get_is_artificial()[type];
367  if (is_art) {
368  // Artificial mechanisms such as PatternStim and IntervalFire
369  // are not copied onto the GPU. They should not, therefore, be
370  // updated from the GPU.
371  return;
372  }
373 
374  int n = ml->nodecount;
375  int szp = corenrn.get_prop_param_size()[type];
376  int szdp = corenrn.get_prop_dparam_size()[type];
377 
378  int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp;
379 
380  nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n]))
381  nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n]))
382 
383  int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
384  nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp))
385  nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp))
386 
387  auto nrb = ml->_net_receive_buffer;
388 
389  // clang-format off
390  nrn_pragma_acc(update self(nrb->_cnt,
391  nrb->_size,
392  nrb->_pnt_offset,
393  nrb->_displ_cnt,
394  nrb->_pnt_index[:nrb->_size],
395  nrb->_weight_index[:nrb->_size],
396  nrb->_displ[:nrb->_size + 1],
397  nrb->_nrb_index[:nrb->_size])
398  if (nrb != nullptr))
399  nrn_pragma_omp(target update from(nrb->_cnt,
400  nrb->_size,
401  nrb->_pnt_offset,
402  nrb->_displ_cnt,
403  nrb->_pnt_index[:nrb->_size],
404  nrb->_weight_index[:nrb->_size],
405  nrb->_displ[:nrb->_size + 1],
406  nrb->_nrb_index[:nrb->_size])
407  if (nrb != nullptr))
408  // clang-format on
409 }
410 
411 static void delete_ml_from_device(Memb_list* ml, int type) {
412  int is_art = corenrn.get_is_artificial()[type];
413  if (is_art) {
414  return;
415  }
416  // Cleanup the net send buffer if it exists
417  {
418  NetSendBuffer_t* nsb{ml->_net_send_buffer};
419  if (nsb) {
420  cnrn_target_delete(nsb->_nsb_flag, nsb->_size);
421  cnrn_target_delete(nsb->_nsb_t, nsb->_size);
422  cnrn_target_delete(nsb->_weight_index, nsb->_size);
423  cnrn_target_delete(nsb->_pnt_index, nsb->_size);
424  cnrn_target_delete(nsb->_vdata_index, nsb->_size);
425  cnrn_target_delete(nsb->_sendtype, nsb->_size);
426  cnrn_target_delete(nsb);
427  }
428  }
429  // Cleanup the net receive buffer if it exists.
430  {
431  NetReceiveBuffer_t* nrb{ml->_net_receive_buffer};
432  if (nrb) {
433  cnrn_target_delete(nrb->_nrb_index, nrb->_size);
434  cnrn_target_delete(nrb->_displ, nrb->_size + 1);
435  cnrn_target_delete(nrb->_nrb_flag, nrb->_size);
436  cnrn_target_delete(nrb->_nrb_t, nrb->_size);
437  cnrn_target_delete(nrb->_weight_index, nrb->_size);
438  cnrn_target_delete(nrb->_pnt_index, nrb->_size);
439  cnrn_target_delete(nrb);
440  }
441  }
442  int n = ml->nodecount;
443  int szdp = corenrn.get_prop_dparam_size()[type];
444  int ts = corenrn.get_memb_funcs()[type].thread_size_;
445  if (ts) {
446  cnrn_target_delete(ml->_thread, ts);
447  }
448  if (szdp) {
449  int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp;
450  cnrn_target_delete(ml->pdata, pcnt);
451  }
452  cnrn_target_delete(ml->nodeindices, n);
453 
454  if (ml->global_variables) {
455  assert(ml->global_variables_size);
456  cnrn_target_delete(static_cast<std::byte*>(ml->global_variables),
457  ml->global_variables_size);
458  }
459 
460  cnrn_target_delete(ml);
461 }
462 
463 #endif
464 
465 /* note: the threads here correspond to the global nrn_threads array */
466 void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
467 #ifdef CORENEURON_ENABLE_GPU
468  // initialize NrnThreads for gpu execution
469  // empty thread or only artificial cells should be on cpu
470  for (int i = 0; i < nthreads; i++) {
471  NrnThread* nt = threads + i;
472  nt->compute_gpu = (nt->end > 0) ? 1 : 0;
473  nt->_dt = dt;
474  }
475 
477 
478 #ifdef CORENEURON_UNIFIED_MEMORY
479  for (int i = 0; i < nthreads; i++) {
480  NrnThread* nt = threads + i; // NrnThread on host
481 
482  if (nt->n_presyn) {
483  PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
484  }
485 
486  if (nt->n_vecplay) {
487  /* copy VecPlayContinuous instances */
488  /** just empty containers */
489  void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
490  // note: we are using unified memory for NrnThread. Once VecPlay is copied to the GPU,
491  // we don't want to update nt->_vecplay, because (due to unified memory) that would also
492  // overwrite the CPU-visible pointer inside nt with the GPU pointer.
493 
494  nrn_VecPlay_copyto_device(nt, d_vecplay);
495  }
496 
497  if (!nt->_permute && nt->end > 0) {
498  printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
499  }
500  }
501 
502 #else
503  /* -- copy NrnThread to device. This needs to be a contiguous vector because an
504  * offset is used to find the corresponding NrnThread from a Point_process in the
505  * NET_RECEIVE block.
506  */
507  NrnThread* d_threads = cnrn_target_copyin(threads, nthreads);
508 
509  if (interleave_info == nullptr) {
510  printf("\n Warning: No permutation data? Required for linear algebra!");
511  }
512 
513  /* pointers for data struct on device, starting with d_ */
514 
515  for (int i = 0; i < nthreads; i++) {
516  NrnThread* nt = threads + i; // NrnThread on host
517  NrnThread* d_nt = d_threads + i; // NrnThread on device
518  if (!nt->compute_gpu) {
519  continue;
520  }
521  double* d__data; // nrn_threads->_data on device
522 
523  /* -- copy _data to device -- */
524 
525  /*copy all double data for thread */
526  d__data = cnrn_target_copyin(nt->_data, nt->_ndata);
527 
528 
529  /* Here is an example of using OpenACC data enter/exit
530  * Remember that we are not allowed to use nt->_data but we have to use:
531  * double *dtmp = nt->_data; // now use dtmp!
532  #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id)
533  #pragma acc wait(nt->stream_id)
534  */
535 
536  /*update d_nt._data to point to device copy */
537  cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data);
538 
539  /* -- setup rhs, d, a, b, v, node_area to point to device copy -- */
540  double* dptr;
541 
542  /* for padding, we have to recompute ne */
543  int ne = nrn_soa_padded_size(nt->end, 0);
544 
545  dptr = d__data + 0 * ne;
546  cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr));
547 
548  dptr = d__data + 1 * ne;
549  cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr));
550 
551  dptr = d__data + 2 * ne;
552  cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr));
553 
554  dptr = d__data + 3 * ne;
555  cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr));
556 
557  dptr = d__data + 4 * ne;
558  cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr));
559 
560  dptr = d__data + 5 * ne;
561  cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr));
562 
563  if (nt->_actual_diam) {
564  dptr = d__data + 6 * ne;
565  cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr));
566  }
567 
568  int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end);
569  cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index));
570 
571  /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/
572  Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size());
573  cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list));
574 
575  /* -- copy NrnThreadMembList list ml to device -- */
576 
577  NrnThreadMembList* d_last_tml;
578 
579  bool first_tml = true;
580 
581  for (auto tml = nt->tml; tml; tml = tml->next) {
582  /*copy tml to device*/
583  /* QUESTION: will the last tml->next be nullptr on the device, as on the host? I assume so. */
584  auto d_tml = cnrn_target_copyin(tml);
585 
586  /*first tml is pointed by nt */
587  if (first_tml) {
588  cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml);
589  first_tml = false;
590  } else {
591  /*rest of tml forms linked list */
592  cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml);
593  }
594 
595  // book keeping for linked-list
596  d_last_tml = d_tml;
597 
598  /* now for every tml, there is a ml. copy that and setup pointer */
599  Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
600  cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml);
601  /* setup nt._ml_list */
602  cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml);
603  }
604 
605  if (nt->shadow_rhs_cnt) {
606  double* d_shadow_ptr;
607 
608  int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
609 
610  /* copy shadow_rhs to device and fix-up the pointer */
611  d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt);
612  cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr);
613 
614  /* copy shadow_d to device and fix-up the pointer */
615  d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt);
616  cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr);
617  }
618 
619  /* Fast membrane current calculation struct */
620  if (nt->nrn_fast_imem) {
621  NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem);
622  cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem);
623  {
624  double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
625  cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr);
626  }
627  {
628  double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end);
629  cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr);
630  }
631  }
632 
633  if (nt->n_pntproc) {
634  /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU
635  */
636  Point_process* pntptr = cnrn_target_copyin(nt->pntprocs, nt->n_pntproc);
637  cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr);
638  }
639 
640  if (nt->n_weight) {
641  /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */
642  double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight);
643  cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights);
644  }
645 
646  if (nt->_nvdata) {
647  /* copy vdata which is setup in bbcore_read. This contains cuda allocated
648  * nrnran123_State * */
649  void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata);
650  cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata);
651  }
652 
653  if (nt->n_presyn) {
654  /* copy presyn vector used for spike exchange. Note that we added the new
655  * PreSynHelper struct because of an issue while updating PreSyn objects,
656  * which have a virtual base class. This may be caused by the vtable
657  * and alignment.
658  */
659  PreSynHelper* d_presyns_helper = cnrn_target_copyin(nt->presyns_helper, nt->n_presyn);
660  cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper);
661  PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn);
662  cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns);
663  }
664 
665  if (nt->_net_send_buffer_size) {
666  /* copy send_receive buffer */
667  int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer,
668  nt->_net_send_buffer_size);
669  cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer);
670  }
671 
672  if (nt->n_vecplay) {
673  /* copy VecPlayContinuous instances */
674  /** just empty containers */
675  void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay);
676  cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay);
677 
678  nrn_VecPlay_copyto_device(nt, d_vecplay);
679  }
680 
681  if (nt->_permute) {
682  if (interleave_permute_type == 1) {
683  /* todo: not necessary to setup pointers, just copy it */
684  InterleaveInfo* info = interleave_info + i;
685  int* d_ptr = nullptr;
686  InterleaveInfo* d_info = cnrn_target_copyin(info);
687 
688  d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1);
689  cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);
690 
691  d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell);
692  cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);
693 
694  d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell);
695  cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);
696 
697  d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell);
698  cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);
699 
700  } else if (interleave_permute_type == 2) {
701  /* todo: not necessary to setup pointers, just copy it */
702  InterleaveInfo* info = interleave_info + i;
703  InterleaveInfo* d_info = cnrn_target_copyin(info);
704  int* d_ptr = nullptr;
705 
706  d_ptr = cnrn_target_copyin(info->stride, info->nstride);
707  cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr);
708 
709  d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1);
710  cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr);
711 
712  d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1);
713  cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr);
714 
715  d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1);
716  cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr);
717 
718  d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp);
719  cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr);
720  } else {
721  printf("\n ERROR: only --cell_permute = [12] implemented");
722  abort();
723  }
724  } else {
725  printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
726  }
727 
728  {
729  TrajectoryRequests* tr = nt->trajec_requests;
730  if (tr) {
731  // Create a device-side copy of the `trajec_requests` struct and
732  // make sure the device-side NrnThread object knows about it.
733  TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr);
734  cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests);
735  // Initialise the double** gather member of the struct.
736  double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec);
737  cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather);
738  // Initialise the double** varrays member of the struct if it's
739  // set.
740  double** d_tr_varrays{nullptr};
741  if (tr->varrays) {
742  d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec);
743  cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays);
744  }
745  for (int i = 0; i < tr->n_trajec; ++i) {
746  if (tr->varrays) {
747  // tr->varrays[i] is a buffer of tr->bsize doubles on the host,
748  // make a device-side copy of it and store a pointer to it in
749  // the device-side version of tr->varrays.
750  double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize);
751  cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i);
752  }
753  // tr->gather[i] is a double* referring to (host) data in the
754  // (host) _data block
755  auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]);
756  cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i);
757  }
758  // TODO: other `double** scatter` and `void** vpr` members of
759  // the TrajectoryRequests struct are not copied to the device.
760  // The `int vsize` member is updated during the simulation but
761  // not kept up to date timestep-by-timestep on the device.
762  }
763  }
764  {
765  auto* d_fornetcon_perm_indices = cnrn_target_copyin(nt->_fornetcon_perm_indices,
766  nt->_fornetcon_perm_indices_size);
767  cnrn_target_memcpy_to_device(&(d_nt->_fornetcon_perm_indices),
768  &d_fornetcon_perm_indices);
769  }
770  {
771  auto* d_fornetcon_weight_perm = cnrn_target_copyin(nt->_fornetcon_weight_perm,
772  nt->_fornetcon_weight_perm_size);
773  cnrn_target_memcpy_to_device(&(d_nt->_fornetcon_weight_perm), &d_fornetcon_weight_perm);
774  }
775  }
776 
777 #endif
778 #else
779  (void) threads;
780  (void) nthreads;
781 #endif
782 }
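[Editor's note, an assumption about typical usage rather than something taken from this file] setup_nrnthreads_on_device() is the mirror image of delete_nrnthreads_on_device() further below, while update_nrnthreads_on_host() and update_weights_from_gpu() pull results back while the model stays resident on the GPU. A rough driver sketch using only functions defined in this file:

// Hypothetical driver sketch; the real call sites live elsewhere in CoreNEURON.
void gpu_lifecycle_sketch(NrnThread* threads, int nthreads) {
    setup_nrnthreads_on_device(threads, nthreads);   // deep-copy the model to the GPU
    // ... run simulation kernels on the device ...
    update_nrnthreads_on_host(threads, nthreads);    // copy voltages, buffers, etc. back
    delete_nrnthreads_on_device(threads, nthreads);  // release device copies in reverse order
}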
783 
784 void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) {
785 #ifdef CORENEURON_ENABLE_GPU
786  /// by default `to` is the destination pointer on the device
787  IvocVect* d_iv = &to;
788 
789  size_t n = from.size();
790  if (n) {
791  double* d_data = cnrn_target_copyin(from.data(), n);
792  cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data);
793  }
794 #else
795  (void) from;
796  (void) to;
797 #endif
798 }
799 
800 void delete_ivoc_vect_from_device(IvocVect& vec) {
801 #ifdef CORENEURON_ENABLE_GPU
802  auto const n = vec.size();
803  if (n) {
804  cnrn_target_delete(vec.data(), n);
805  }
806 #else
807  static_cast<void>(vec);
808 #endif
809 }
810 
811 void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) {
812  NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
813  if (!nrb) {
814  return;
815  }
816 
817 #ifdef CORENEURON_ENABLE_GPU
818  if (nt->compute_gpu) {
819  // free existing vectors in buffers on gpu
820  cnrn_target_delete(nrb->_pnt_index, nrb->_size);
821  cnrn_target_delete(nrb->_weight_index, nrb->_size);
822  cnrn_target_delete(nrb->_nrb_t, nrb->_size);
823  cnrn_target_delete(nrb->_nrb_flag, nrb->_size);
824  cnrn_target_delete(nrb->_displ, nrb->_size + 1);
825  cnrn_target_delete(nrb->_nrb_index, nrb->_size);
826  }
827 #endif
828  // Reallocate host buffers using ecalloc_align (as in phase2.cpp) and
829  // free_memory (as in nrn_setup.cpp)
830  auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
831  using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
832  static_assert(std::is_trivial<T>::value,
833  "Only trivially constructible and copiable types are supported.");
834  static_assert(std::is_same<decltype(ptr), T*&>::value,
835  "ptr should be reference-to-pointer");
836  auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
837  std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
838  free_memory(ptr);
839  ptr = new_data;
840  };
841  nrb->_size *= 2;
842  realloc(nrb->_pnt_index);
843  realloc(nrb->_weight_index);
844  realloc(nrb->_nrb_t);
845  realloc(nrb->_nrb_flag);
846  realloc(nrb->_displ, 1);
847  realloc(nrb->_nrb_index);
848 #ifdef CORENEURON_ENABLE_GPU
849  if (nt->compute_gpu) {
850  // update device copy
851  nrn_pragma_acc(update device(nrb));
852  nrn_pragma_omp(target update to(nrb));
853 
854  NetReceiveBuffer_t* const d_nrb{cnrn_target_deviceptr(nrb)};
855  // recopy the vectors in the buffer
856  int* const d_pnt_index{cnrn_target_copyin(nrb->_pnt_index, nrb->_size)};
857  cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index);
858 
859  int* const d_weight_index{cnrn_target_copyin(nrb->_weight_index, nrb->_size)};
860  cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index);
861 
862  double* const d_nrb_t{cnrn_target_copyin(nrb->_nrb_t, nrb->_size)};
863  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t);
864 
865  double* const d_nrb_flag{cnrn_target_copyin(nrb->_nrb_flag, nrb->_size)};
866  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag);
867 
868  int* const d_displ{cnrn_target_copyin(nrb->_displ, nrb->_size + 1)};
869  cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ);
870 
871  int* const d_nrb_index{cnrn_target_copyin(nrb->_nrb_index, nrb->_size)};
872  cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index);
873  }
874 #endif
875 }
876 
877 using NRB_P = std::pair<int, int>;
878 
879 struct comp {
880  bool operator()(const NRB_P& a, const NRB_P& b) {
881  if (a.first == b.first) {
882  return a.second > b.second; // same instances in original net_receive order
883  }
884  return a.first > b.first;
885  }
886 };
887 
888 static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) {
889  Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-order");
890  if (nrb->_cnt == 0) {
891  nrb->_displ_cnt = 0;
892  return;
893  }
894 
895  std::priority_queue<NRB_P, std::vector<NRB_P>, comp> nrbq;
896 
897  for (int i = 0; i < nrb->_cnt; ++i) {
898  nrbq.push(NRB_P(nrb->_pnt_index[i], i));
899  }
900 
901  int displ_cnt = 0;
902  int index_cnt = 0;
903  int last_instance_index = -1;
904  nrb->_displ[0] = 0;
905 
906  while (!nrbq.empty()) {
907  const NRB_P& p = nrbq.top();
908  nrb->_nrb_index[index_cnt++] = p.second;
909  if (p.first != last_instance_index) {
910  ++displ_cnt;
911  }
912  nrb->_displ[displ_cnt] = index_cnt;
913  last_instance_index = p.first;
914  nrbq.pop();
915  }
916  nrb->_displ_cnt = displ_cnt;
917 }
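[Editor's worked example, not in the file] The priority queue above groups buffered events by target instance (_pnt_index) while keeping each instance's events in their original order, and _displ records where each group ends. For instance, assuming a buffer with _cnt == 4:

// _pnt_index = {7, 3, 7, 3}   (events 0..3, two target instances: 3 and 7)
// After net_receive_buffer_order(nrb):
//   _nrb_index = {1, 3, 0, 2}  // events of instance 3 first (in order), then instance 7
//   _displ     = {0, 2, 4}     // group k spans _nrb_index[_displ[k] .. _displ[k+1])
//   _displ_cnt = 2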
918 
919 /* When we execute a NET_RECEIVE block on the GPU, we provide the indices of the synapse
920  * instances that need to be executed during the current timestep. To do this, we have to
921  * update the NetReceiveBuffer_t object on the GPU. When the size of the CPU buffer changes,
922  * we set reallocated to true and hence need to reallocate the buffer on the GPU and copy the
923  * entire buffer. If reallocated is 0, the buffer size has not changed and we only need to
924  * copy _size elements to the GPU.
925  * Note: this is a very preliminary implementation; optimisations will be done after the first
926  * functional version.
927  */
928 void update_net_receive_buffer(NrnThread* nt) {
929  Instrumentor::phase p_update_net_receive_buffer("update-net-receive-buf");
930  for (auto tml = nt->tml; tml; tml = tml->next) {
931  int is_art = corenrn.get_is_artificial()[tml->index];
932  if (is_art) {
933  continue;
934  }
935  // net_receive buffer to copy
936  NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer;
937 
938  // if a net receive buffer exists for this mechanism
939  if (nrb && nrb->_cnt) {
940  // instance order to avoid race. setup _displ and _nrb_index
941  net_receive_buffer_order(nrb);
942 
943  if (nt->compute_gpu) {
944  Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu");
945  // note: don't update nrb itself, otherwise we lose the pointers
946 
947  // clang-format off
948 
949  /* update scalar elements */
950  nrn_pragma_acc(update device(nrb->_cnt,
951  nrb->_displ_cnt,
952  nrb->_pnt_index[:nrb->_cnt],
953  nrb->_weight_index[:nrb->_cnt],
954  nrb->_nrb_t[:nrb->_cnt],
955  nrb->_nrb_flag[:nrb->_cnt],
956  nrb->_displ[:nrb->_displ_cnt + 1],
957  nrb->_nrb_index[:nrb->_cnt])
958  async(nt->stream_id))
959  nrn_pragma_omp(target update to(nrb->_cnt,
960  nrb->_displ_cnt,
961  nrb->_pnt_index[:nrb->_cnt],
962  nrb->_weight_index[:nrb->_cnt],
963  nrb->_nrb_t[:nrb->_cnt],
964  nrb->_nrb_flag[:nrb->_cnt],
965  nrb->_displ[:nrb->_displ_cnt + 1],
966  nrb->_nrb_index[:nrb->_cnt]))
967  // clang-format on
968  }
969  }
970  }
971  nrn_pragma_acc(wait(nt->stream_id))
972 }
973 
974 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
975 #ifdef CORENEURON_ENABLE_GPU
976  if (!nt->compute_gpu)
977  return;
978 
979  // check if nsb->_cnt was exceeded on GPU: as the buffer can not be increased
980  // during gpu execution, we should just abort the execution.
981  // \todo: this needs to be fixed with different memory allocation strategy
982  if (nsb->_cnt > nsb->_size) {
983  printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
984  nrn_abort(1);
985  }
986 
987  if (nsb->_cnt) {
988  Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu");
989  }
990  // clang-format off
991  nrn_pragma_acc(update self(nsb->_sendtype[:nsb->_cnt],
992  nsb->_vdata_index[:nsb->_cnt],
993  nsb->_pnt_index[:nsb->_cnt],
994  nsb->_weight_index[:nsb->_cnt],
995  nsb->_nsb_t[:nsb->_cnt],
996  nsb->_nsb_flag[:nsb->_cnt])
997  if (nsb->_cnt))
998  nrn_pragma_omp(target update from(nsb->_sendtype[:nsb->_cnt],
999  nsb->_vdata_index[:nsb->_cnt],
1000  nsb->_pnt_index[:nsb->_cnt],
1001  nsb->_weight_index[:nsb->_cnt],
1002  nsb->_nsb_t[:nsb->_cnt],
1003  nsb->_nsb_flag[:nsb->_cnt])
1004  if (nsb->_cnt))
1005  // clang-format on
1006 #else
1007  (void) nt;
1008  (void) nsb;
1009 #endif
1010 }
1011 
1012 void update_nrnthreads_on_host(NrnThread* threads, int nthreads) {
1013 #ifdef CORENEURON_ENABLE_GPU
1014 
1015  for (int i = 0; i < nthreads; i++) {
1016  NrnThread* nt = threads + i;
1017 
1018  if (nt->compute_gpu && (nt->end > 0)) {
1019  /* -- copy data to host -- */
1020 
1021  int ne = nrn_soa_padded_size(nt->end, 0);
1022 
1023  // clang-format off
1024  nrn_pragma_acc(update self(nt->_actual_rhs[:ne],
1025  nt->_actual_d[:ne],
1026  nt->_actual_a[:ne],
1027  nt->_actual_b[:ne],
1028  nt->_actual_v[:ne],
1029  nt->_actual_area[:ne]))
1030  nrn_pragma_omp(target update from(nt->_actual_rhs[:ne],
1031  nt->_actual_d[:ne],
1032  nt->_actual_a[:ne],
1033  nt->_actual_b[:ne],
1034  nt->_actual_v[:ne],
1035  nt->_actual_area[:ne]))
1036  // clang-format on
1037 
1038  nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr))
1039  nrn_pragma_omp(
1040  target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr))
1041 
1042  /* @todo: nt._ml_list[tml->index] = tml->ml; */
1043 
1044  /* -- copy NrnThreadMembList list ml to host -- */
1045  for (auto tml = nt->tml; tml; tml = tml->next) {
1046  if (!corenrn.get_is_artificial()[tml->index]) {
1047  nrn_pragma_acc(update self(tml->index, tml->ml->nodecount))
1048  nrn_pragma_omp(target update from(tml->index, tml->ml->nodecount))
1049  }
1050  update_ml_on_host(tml->ml, tml->index);
1051  }
1052 
1053  int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
1054  /* copy shadow_rhs to host */
1055  /* copy shadow_d to host */
1056  nrn_pragma_acc(
1057  update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt))
1058  nrn_pragma_omp(target update from(
1059  nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt))
1060 
1061  // clang-format off
1062  nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end],
1063  nt->nrn_fast_imem->nrn_sav_d[:nt->end])
1064  if (nt->nrn_fast_imem != nullptr))
1065  nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end],
1066  nt->nrn_fast_imem->nrn_sav_d[:nt->end])
1067  if (nt->nrn_fast_imem != nullptr))
1068  // clang-format on
1069 
1070  nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc))
1071  nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc))
1072 
1073  nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if (nt->n_weight))
1074  nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight))
1075 
1076  nrn_pragma_acc(update self(
1077  nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn))
1078  nrn_pragma_omp(target update from(
1079  nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn))
1080 
1081  {
1082  TrajectoryRequests* tr = nt->trajec_requests;
1083  if (tr && tr->varrays) {
1084  // The full buffers have `bsize` entries, but only `vsize`
1085  // of them are valid.
1086  for (int i = 0; i < tr->n_trajec; ++i) {
1087  nrn_pragma_acc(update self(tr->varrays[i][:tr->vsize]))
1088  nrn_pragma_omp(target update from(tr->varrays[i][:tr->vsize]))
1089  }
1090  }
1091  }
1092 
1093  /* don't update vdata, it's a pointer array
1094  nrn_pragma_acc(update self(nt->_vdata[:nt->_nvdata) if nt->_nvdata)
1095  nrn_pragma_omp(target update from(nt->_vdata[:nt->_nvdata) if (nt->_nvdata))
1096  */
1097  }
1098  }
1099 #else
1100  (void) threads;
1101  (void) nthreads;
1102 #endif
1103 }
1104 
1105 /**
1106  * Copy weights from GPU to CPU
1107  *
1108  * User may record NetCon weights at the end of simulation.
1109  * For this purpose update weights of all NrnThread objects
1110  * from GPU to CPU.
1111  */
1112 void update_weights_from_gpu(NrnThread* threads, int nthreads) {
1113 #ifdef CORENEURON_ENABLE_GPU
1114  for (int i = 0; i < nthreads; i++) {
1115  NrnThread* nt = threads + i;
1116  size_t n_weight = nt->n_weight;
1117  if (nt->compute_gpu && n_weight > 0) {
1118  double* weights = nt->weights;
1119  nrn_pragma_acc(update host(weights [0:n_weight]))
1120  nrn_pragma_omp(target update from(weights [0:n_weight]))
1121  }
1122  }
1123 #endif
1124 }
1125 
1126 /** Cleanup device memory that is being tracked by the OpenACC runtime.
1127  *
1128  * This function painstakingly calls `cnrn_target_delete` in reverse order on all
1129  * pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`.
1130  * This cleanup ensures that if the GPU is initialised multiple times from the
1131  * same process then the OpenACC runtime will not be polluted with old
1132  * pointers, which can cause errors. In particular if we do:
1133  * @code
1134  * {
1135  * // ... some_ptr is dynamically allocated ...
1136  * cnrn_target_copyin(some_ptr, some_size);
1137  * // ... do some work ...
1138  * // cnrn_target_delete(some_ptr);
1139  * free(some_ptr);
1140  * }
1141  * {
1142  * // ... same_ptr_again is dynamically allocated at the same address ...
1143  * cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR
1144  * }
1145  * @endcode
1146  * the application will/may abort with an error such as:
1147  * FATAL ERROR: variable in data clause is partially present on the device.
1148  * The pattern above is typical of calling CoreNEURON on GPU multiple times in
1149  * the same process.
1150  */
1151 void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) {
1152 #ifdef CORENEURON_ENABLE_GPU
1153  for (int i = 0; i < nthreads; i++) {
1154  NrnThread* nt = threads + i;
1155  if (!nt->compute_gpu) {
1156  continue;
1157  }
1160  {
1161  TrajectoryRequests* tr = nt->trajec_requests;
1162  if (tr) {
1163  if (tr->varrays) {
1164  for (int i = 0; i < tr->n_trajec; ++i) {
1165  cnrn_target_delete(tr->varrays[i], tr->bsize);
1166  }
1167  cnrn_target_delete(tr->varrays, tr->n_trajec);
1168  }
1169  cnrn_target_delete(tr->gather, tr->n_trajec);
1170  cnrn_target_delete(tr);
1171  }
1172  }
1173  if (nt->_permute) {
1174  if (interleave_permute_type == 1) {
1175  InterleaveInfo* info = interleave_info + i;
1176  cnrn_target_delete(info->cellsize, nt->ncell);
1177  cnrn_target_delete(info->lastnode, nt->ncell);
1178  cnrn_target_delete(info->firstnode, nt->ncell);
1179  cnrn_target_delete(info->stride, info->nstride + 1);
1180  cnrn_target_delete(info);
1181  } else if (interleave_permute_type == 2) {
1182  InterleaveInfo* info = interleave_info + i;
1183  cnrn_target_delete(info->cellsize, info->nwarp);
1184  cnrn_target_delete(info->stridedispl, info->nwarp + 1);
1185  cnrn_target_delete(info->lastnode, info->nwarp + 1);
1186  cnrn_target_delete(info->firstnode, info->nwarp + 1);
1187  cnrn_target_delete(info->stride, info->nstride);
1188  cnrn_target_delete(info);
1189  }
1190  }
1191 
1192  if (nt->n_vecplay) {
1193  nrn_VecPlay_delete_from_device(nt);
1194  cnrn_target_delete(nt->_vecplay, nt->n_vecplay);
1195  }
1196 
1197  // Cleanup send_receive buffer.
1198  if (nt->_net_send_buffer_size) {
1199  cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size);
1200  }
1201 
1202  if (nt->n_presyn) {
1203  cnrn_target_delete(nt->presyns, nt->n_presyn);
1204  cnrn_target_delete(nt->presyns_helper, nt->n_presyn);
1205  }
1206 
1207  // Cleanup data that's setup in bbcore_read.
1208  if (nt->_nvdata) {
1209  cnrn_target_delete(nt->_vdata, nt->_nvdata);
1210  }
1211 
1212  // Cleanup weight vector used in NET_RECEIVE
1213  if (nt->n_weight) {
1214  cnrn_target_delete(nt->weights, nt->n_weight);
1215  }
1216 
1217  // Cleanup point processes
1218  if (nt->n_pntproc) {
1219  cnrn_target_delete(nt->pntprocs, nt->n_pntproc);
1220  }
1221 
1222  if (nt->nrn_fast_imem) {
1223  cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end);
1224  cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end);
1225  cnrn_target_delete(nt->nrn_fast_imem);
1226  }
1227 
1228  if (nt->shadow_rhs_cnt) {
1229  int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0);
1230  cnrn_target_delete(nt->_shadow_d, pcnt);
1231  cnrn_target_delete(nt->_shadow_rhs, pcnt);
1232  }
1233 
1234  for (auto tml = nt->tml; tml; tml = tml->next) {
1235  delete_ml_from_device(tml->ml, tml->index);
1236  cnrn_target_delete(tml);
1237  }
1238  cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size());
1239  cnrn_target_delete(nt->_v_parent_index, nt->end);
1240  cnrn_target_delete(nt->_data, nt->_ndata);
1241  }
1242  cnrn_target_delete(threads, nthreads);
1244 #endif
1245 }
1246 
1247 
1248 void nrn_newtonspace_copyto_device(NewtonSpace* ns) {
1249 #ifdef CORENEURON_ENABLE_GPU
1250  // FIXME this check needs to be tweaked if we ever want to run with a mix
1251  // of CPU and GPU threads.
1252  if (nrn_threads[0].compute_gpu == 0) {
1253  return;
1254  }
1255 
1256  int n = ns->n * ns->n_instance;
1257  // actually, the values of double do not matter, only the pointers.
1258  NewtonSpace* d_ns = cnrn_target_copyin(ns);
1259 
1260  double* pd;
1261 
1262  pd = cnrn_target_copyin(ns->delta_x, n);
1263  cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd);
1264 
1265  pd = cnrn_target_copyin(ns->high_value, n);
1266  cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd);
1267 
1268  pd = cnrn_target_copyin(ns->low_value, n);
1269  cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd);
1270 
1271  pd = cnrn_target_copyin(ns->rowmax, n);
1272  cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd);
1273 
1274  auto pint = cnrn_target_copyin(ns->perm, n);
1275  cnrn_target_memcpy_to_device(&(d_ns->perm), &pint);
1276 
1277  auto ppd = cnrn_target_copyin(ns->jacobian, ns->n);
1278  cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd);
1279 
1280  // the actual jacobian doubles were allocated as a single array
1281  double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n);
1282 
1283  for (int i = 0; i < ns->n; ++i) {
1284  pd = d_jacdat + i * n;
1285  cnrn_target_memcpy_to_device(&(ppd[i]), &pd);
1286  }
1287 #endif
1288 }
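[Editor's note, illustrative and based only on the code above] ns->jacobian is an array of ns->n row pointers into one contiguous allocation of ns->n * n doubles (where n == ns->n * ns->n_instance), so only jacobian[0] needs a real copyin; the loop then rewrites each device-side row pointer as an offset into that block. Schematically:

// host:   jacobian[i] == jacobian[0] + i * n          (single contiguous block)
// device: d_jacdat    == cnrn_target_copyin(ns->jacobian[0], ns->n * n)
//         ppd[i]      <- d_jacdat + i * n             (patched row by row in the loop above)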
1289 
1290 void nrn_newtonspace_delete_from_device(NewtonSpace* ns) {
1291 #ifdef CORENEURON_ENABLE_GPU
1292  // FIXME this check needs to be tweaked if we ever want to run with a mix
1293  // of CPU and GPU threads.
1294  if (nrn_threads[0].compute_gpu == 0) {
1295  return;
1296  }
1297  int n = ns->n * ns->n_instance;
1298  cnrn_target_delete(ns->jacobian[0], ns->n * n);
1299  cnrn_target_delete(ns->jacobian, ns->n);
1300  cnrn_target_delete(ns->perm, n);
1301  cnrn_target_delete(ns->rowmax, n);
1302  cnrn_target_delete(ns->low_value, n);
1303  cnrn_target_delete(ns->high_value, n);
1304  cnrn_target_delete(ns->delta_x, n);
1305  cnrn_target_delete(ns);
1306 #endif
1307 }
1308 
1309 void nrn_sparseobj_copyto_device(SparseObj* so) {
1310 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
1311  // FIXME this check needs to be tweaked if we ever want to run with a mix
1312  // of CPU and GPU threads.
1313  if (nrn_threads[0].compute_gpu == 0) {
1314  return;
1315  }
1316 
1317  unsigned n1 = so->neqn + 1;
1318  SparseObj* d_so = cnrn_target_copyin(so);
1319  // only pointer fields in SparseObj that need setting up are
1320  // rowst, diag, rhs, ngetcall, coef_list
1321  // only pointer fields in Elm that need setting up are
1322  // r_down, c_right, value
1323  // do not care about the Elm* ptr value, just the space.
1324 
1325  Elm** d_rowst = cnrn_target_copyin(so->rowst, n1);
1326  cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst);
1327 
1328  Elm** d_diag = cnrn_target_copyin(so->diag, n1);
1329  cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag);
1330 
1331  unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded);
1332  cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu);
1333 
1334  double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded);
1335  cnrn_target_memcpy_to_device(&(d_so->rhs), &pd);
1336 
1337  double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size);
1338  cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list);
1339 
1340  // Fill in relevant Elm pointer values
1341 
1342  for (unsigned irow = 1; irow < n1; ++irow) {
1343  for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1344  Elm* pelm = cnrn_target_copyin(elm);
1345 
1346  if (elm == so->rowst[irow]) {
1347  cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm);
1348  } else {
1349  Elm* d_e = cnrn_target_deviceptr(elm->c_left);
1350  cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e);
1351  }
1352 
1353  if (elm->col == elm->row) {
1354  cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm);
1355  }
1356 
1357  if (irow > 1) {
1358  if (elm->r_up) {
1359  Elm* d_e = cnrn_target_deviceptr(elm->r_up);
1360  cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e);
1361  }
1362  }
1363 
1364  pd = cnrn_target_copyin(elm->value, so->_cntml_padded);
1365  cnrn_target_memcpy_to_device(&(pelm->value), &pd);
1366  }
1367  }
1368 
1369  // visit all the Elm again and fill in pelm->r_down and pelm->c_left
1370  for (unsigned irow = 1; irow < n1; ++irow) {
1371  for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1372  auto pelm = cnrn_target_deviceptr(elm);
1373  if (elm->r_down) {
1374  auto d_e = cnrn_target_deviceptr(elm->r_down);
1375  cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e);
1376  }
1377  if (elm->c_right) {
1378  auto d_e = cnrn_target_deviceptr(elm->c_right);
1379  cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e);
1380  }
1381  }
1382  }
1383 
1384  // Fill in the d_so->coef_list
1385  for (unsigned i = 0; i < so->coef_list_size; ++i) {
1386  pd = cnrn_target_deviceptr(so->coef_list[i]);
1387  cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd);
1388  }
1389 #endif
1390 }
1391 
1392 void nrn_sparseobj_delete_from_device(SparseObj* so) {
1393 #if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
1394  // FIXME this check needs to be tweaked if we ever want to run with a mix
1395  // of CPU and GPU threads.
1396  if (nrn_threads[0].compute_gpu == 0) {
1397  return;
1398  }
1399  unsigned n1 = so->neqn + 1;
1400  for (unsigned irow = 1; irow < n1; ++irow) {
1401  for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) {
1402  cnrn_target_delete(elm->value, so->_cntml_padded);
1403  cnrn_target_delete(elm);
1404  }
1405  }
1406  cnrn_target_delete(so->coef_list, so->coef_list_size);
1407  cnrn_target_delete(so->rhs, n1 * so->_cntml_padded);
1408  cnrn_target_delete(so->ngetcall, so->_cntml_padded);
1409  cnrn_target_delete(so->diag, n1);
1410  cnrn_target_delete(so->rowst, n1);
1411  cnrn_target_delete(so);
1412 #endif
1413 }
1414 
1415 #ifdef CORENEURON_ENABLE_GPU
1416 
1417 void nrn_ion_global_map_copyto_device() {
1418  if (nrn_ion_global_map_size) {
1419  double** d_data = cnrn_target_copyin(nrn_ion_global_map, nrn_ion_global_map_size);
1420  for (int j = 0; j < nrn_ion_global_map_size; j++) {
1421  if (nrn_ion_global_map[j]) {
1422  double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j],
1423  ion_global_map_member_size);
1424  cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap);
1425  }
1426  }
1427  }
1428 }
1429 
1430 void nrn_ion_global_map_delete_from_device() {
1431  for (int j = 0; j < nrn_ion_global_map_size; j++) {
1432  if (nrn_ion_global_map[j]) {
1433  cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size);
1434  }
1435  }
1436  if (nrn_ion_global_map_size) {
1437  cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size);
1438  }
1439 }
1440 
1441 void init_gpu() {
1442  // check how many gpu devices available per node
1443  int num_devices_per_node = cnrn_target_get_num_devices();
1444 
1445  // if no gpu found, can't run on GPU
1446  if (num_devices_per_node == 0) {
1447  nrn_fatal_error("\n ERROR : Enabled GPU execution but couldn't find NVIDIA GPU!\n");
1448  }
1449 
1450  if (corenrn_param.num_gpus != 0) {
1451  if (corenrn_param.num_gpus > num_devices_per_node) {
1452  nrn_fatal_error("Fatal error: asking for '%d' GPUs per node but only '%d' available\n",
1453  corenrn_param.num_gpus,
1454  num_devices_per_node);
1455  } else {
1456  num_devices_per_node = corenrn_param.num_gpus;
1457  }
1458  }
1459 
1460  // get local rank within a node and assign a specific gpu to this rank.
1461  // multiple threads within the node will use same device.
1462  int local_rank = 0;
1463  int local_size = 1;
1464 #if NRNMPI
1465  if (corenrn_param.mpi_enable) {
1466  local_rank = nrnmpi_local_rank();
1467  local_size = nrnmpi_local_size();
1468  }
1469 #endif
1470 
1471  cnrn_target_set_default_device(local_rank % num_devices_per_node);
1472 
1473  if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) {
1474  std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
1475  << " ranks per node\n";
1476  }
1477 }
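[Editor's example, arithmetic only, not from the file] With the round-robin assignment above, local ranks are spread across the visible devices by local_rank % num_devices_per_node. For example, 8 ranks on a node with 4 usable GPUs:

// local_rank:            0 1 2 3 4 5 6 7
// local_rank % 4 (GPU):  0 1 2 3 0 1 2 3
// i.e. two ranks share each device; cnrn_target_set_default_device() is called
// once per rank with that device number.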
1478 
1479 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
1480  for (int i = 0; i < nt->n_vecplay; i++) {
1481  VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];
1482 
1483  /** just VecPlayContinuous object */
1484  VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance);
1485  cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance);
1486 
1487  /** copy y_, t_ and discon_indices_ */
1488  copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_);
1489  copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_);
1490  // OL211213: beware, the test suite does not currently include anything
1491  // with a non-null discon_indices_.
1492  if (vecplay_instance->discon_indices_) {
1493  IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_);
1494  cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices);
1495  copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_),
1496  *(d_vecplay_instance->discon_indices_));
1497  }
1498 
1499  /** copy PlayRecordEvent : todo: verify this */
1500  PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_);
1501 
1502  cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance));
1503  cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_);
1504 
1505  /** copy pd_ : note that it is a pointer inside ml->data and hence the data itself is
1506  * already on the GPU */
1507  double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_);
1508  cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_);
1509  }
1510 }
1511 
1512 void nrn_VecPlay_delete_from_device(NrnThread* nt) {
1513  for (int i = 0; i < nt->n_vecplay; i++) {
1514  auto* vecplay_instance = static_cast<VecPlayContinuous*>(nt->_vecplay[i]);
1515  cnrn_target_delete(vecplay_instance->e_);
1516  if (vecplay_instance->discon_indices_) {
1517  delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_));
1518  }
1519  delete_ivoc_vect_from_device(vecplay_instance->t_);
1520  delete_ivoc_vect_from_device(vecplay_instance->y_);
1521  cnrn_target_delete(vecplay_instance);
1522  }
1523 }
1524 
1525 #endif
1526 } // namespace coreneuron
coreneuron::SparseObj::rowst
Elm ** rowst
Definition: mod2c_core_thread.hpp:48
coreneuron::net_receive_buffer_order
static void net_receive_buffer_order(NetReceiveBuffer_t *nrb)
Definition: nrn_acc_manager.cpp:888
coreneuron::NewtonSpace::high_value
double * high_value
Definition: newton_struct.h:20
coreneuron::delete_ivoc_vect_from_device
void delete_ivoc_vect_from_device(IvocVect &vec)
Definition: nrn_acc_manager.cpp:800
coreneuron::copy_ivoc_vect_to_device
void copy_ivoc_vect_to_device(const IvocVect &from, IvocVect &to)
Definition: nrn_acc_manager.cpp:784
coreneuron::NrnThread::nrn_fast_imem
NrnFastImem * nrn_fast_imem
Definition: multicore.hpp:124
coreneuron::NewtonSpace::perm
int * perm
Definition: newton_struct.h:19
coreneuron::setup_nrnthreads_on_device
void setup_nrnthreads_on_device(NrnThread *threads, int nthreads)
Definition: nrn_acc_manager.cpp:466
coreneuron::comp
Definition: nrn_acc_manager.cpp:879
coreneuron::nrn_VecPlay_copyto_device
void nrn_VecPlay_copyto_device(NrnThread *nt, void **d_vecplay)
free_memory
void free_memory(void *pointer)
Definition: memory.h:196
coreneuron::corenrn_parameters::is_quiet
bool is_quiet()
Definition: corenrn_parameters.hpp:109
coreneuron::cnrn_target_delete_debug
void cnrn_target_delete_debug(std::string_view file, int line, std::size_t sizeof_T, std::type_info const &typeid_T, void const *h_ptr, std::size_t len)
Definition: nrn_acc_manager.cpp:111
coreneuron::NrnThread::_vdata
void ** _vdata
Definition: multicore.hpp:108
coreneuron::corenrn_parameters_data::num_gpus
unsigned num_gpus
Number of warps to balance for cell_interleave_permute == 2.
Definition: corenrn_parameters.hpp:55
coreneuron::NrnThread::_shadow_d
double * _shadow_d
Definition: multicore.hpp:120
coreneuron::InterleaveInfo::nstride
int nstride
Definition: cellorder.hpp:57
coreneuron::fixed_vector::data_
T * data_
Definition: ivocvect.hpp:22
utils.hpp
coreneuron::delete_nrnthreads_on_device
void delete_nrnthreads_on_device(NrnThread *threads, int nthreads)
Cleanup device memory that is being tracked by the OpenACC runtime.
Definition: nrn_acc_manager.cpp:1151
coreneuron::TrajectoryRequests::vsize
int vsize
Definition: multicore.hpp:65
coreneuron::NrnThread::_fornetcon_weight_perm
size_t * _fornetcon_weight_perm
Definition: multicore.hpp:152
coreneuron::Point_process
Definition: mechanism.hpp:35
coreneuron::NrnFastImem::nrn_sav_d
double * nrn_sav_d
Definition: multicore.hpp:54
coreneuron::update_nrnthreads_on_host
void update_nrnthreads_on_host(NrnThread *threads, int nthreads)
Definition: nrn_acc_manager.cpp:1012
coreneuron::NewtonSpace
Definition: newton_struct.h:14
SOA_LAYOUT
#define SOA_LAYOUT
Definition: data_layout.hpp:11
coreneuron::Elm::value
double * value
Definition: mod2c_core_thread.hpp:31
nrnoc_aux.hpp
coreneuron::PreSynHelper
Definition: multicore.hpp:71
coreneuron::CoreNeuron::get_is_artificial
auto & get_is_artificial()
Definition: coreneuron.hpp:178
coreneuron::NrnThread::_net_send_buffer_size
int _net_send_buffer_size
Definition: multicore.hpp:138
coreneuron::ion_global_map_member_size
const int ion_global_map_member_size
coreneuron::NrnThread::presyns
PreSyn * presyns
Definition: multicore.hpp:83
coreneuron::TrajectoryRequests::n_trajec
int n_trajec
Definition: multicore.hpp:63
coreneuron::NrnThread::presyns_helper
PreSynHelper * presyns_helper
Definition: multicore.hpp:84
coreneuron::NewtonSpace::delta_x
double * delta_x
Definition: newton_struct.h:17
nrn_acc_manager.hpp
coreneuron::init_gpu
void init_gpu()
coreneuron::NetReceiveBuffer_t::_displ_cnt
int _displ_cnt
Definition: mechanism.hpp:50
coreneuron::nrn_ion_global_map_copyto_device
void nrn_ion_global_map_copyto_device()
nrn_pragma_omp
nrn_pragma_acc(routine seq) nrn_pragma_omp(declare target) philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron nrn_pragma_omp(end declare target) namespace coreneuron
Provide a helper function in global namespace that is declared target for OpenMP offloading to functi...
Definition: nrnran123.h:69
nrnmpidec.h
coreneuron::cnrn_target_delete
void cnrn_target_delete(std::string_view file, int line, T *h_ptr, std::size_t len=1)
Definition: offload.hpp:132
coreneuron::NetSendBuffer_t
Definition: mechanism.hpp:62
coreneuron::NetSendBuffer_t::_size
int _size
Definition: mechanism.hpp:70
coreneuron::NewtonSpace::jacobian
double ** jacobian
Definition: newton_struct.h:18
coreneuron::Memb_list
Definition: mechanism.hpp:131
coreneuron::cnrn_target_memcpy_to_device
void cnrn_target_memcpy_to_device(std::string_view file, int line, T *d_ptr, const T *h_ptr, std::size_t len=1)
Definition: offload.hpp:150
coreneuron::NrnThread::tml
NrnThreadMembList * tml
Definition: multicore.hpp:80
coreneuron::nrn_soa_padded_size
int nrn_soa_padded_size(int cnt, int layout)
Calculate the size after padding for a specific memory layout.
Definition: mem_layout_util.cpp:15
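The padding computed by nrn_soa_padded_size can be illustrated with a small, self-contained sketch; the padding unit of 8 and the helper name soa_padded_size_sketch below are assumptions for illustration, not the values or names used in mem_layout_util.cpp.

#include <cassert>
#include <cstddef>

// Hedged sketch only: round cnt up to a multiple of an assumed padding unit
// for an SoA layout, so that each member array starts on an aligned boundary;
// an AoS layout would return cnt unchanged.
constexpr std::size_t pad_unit = 8;  // assumed value, for illustration

std::size_t soa_padded_size_sketch(std::size_t cnt, bool soa_layout) {
    if (!soa_layout) {
        return cnt;  // no padding for AoS
    }
    return ((cnt + pad_unit - 1) / pad_unit) * pad_unit;
}

int main() {
    assert(soa_padded_size_sketch(13, true) == 16);   // 13 rounded up to 16
    assert(soa_padded_size_sketch(13, false) == 13);  // AoS: unchanged
}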
coreneuron::interleave_permute_type
int interleave_permute_type
Definition: cellorder.cpp:28
coreneuron::cnrn_target_set_default_device
void cnrn_target_set_default_device(int device_num)
Definition: nrn_acc_manager.cpp:243
coreneuron::nrn_ion_global_map_delete_from_device
void nrn_ion_global_map_delete_from_device()
coreneuron::NrnThread::_fornetcon_weight_perm_size
std::size_t _fornetcon_weight_perm_size
Definition: multicore.hpp:151
coreneuron::NrnThread::_actual_rhs
double * _actual_rhs
Definition: multicore.hpp:111
coreneuron::update_weights_from_gpu
void update_weights_from_gpu(NrnThread *threads, int nthreads)
Copy weights from GPU to CPU.
Definition: nrn_acc_manager.cpp:1112
coreneuron::NrnThread::compute_gpu
int compute_gpu
Definition: multicore.hpp:136
coreneuron::NewtonSpace::n_instance
int n_instance
Definition: newton_struct.h:16
profiler_interface.h
coreneuron::InterleaveInfo::lastnode
int * lastnode
Definition: cellorder.hpp:61
coreneuron.hpp
coreneuron::NetReceiveBuffer_t::_pnt_index
int * _pnt_index
Definition: mechanism.hpp:45
coreneuron::nrn_fatal_error
static void nrn_fatal_error(const char *msg)
Definition: nrnmpi.cpp:30
coreneuron::NrnThread::_actual_a
double * _actual_a
Definition: multicore.hpp:113
coreneuron::NrnThread::_actual_diam
double * _actual_diam
Definition: multicore.hpp:117
coreneuron::NrnThread::_nvdata
size_t _nvdata
Definition: multicore.hpp:104
coreneuron::SparseObj::coef_list_size
unsigned coef_list_size
Definition: mod2c_core_thread.hpp:58
coreneuron::InterleaveInfo::nwarp
int nwarp
Definition: cellorder.hpp:56
coreneuron
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
Definition: corenrn_parameters.cpp:12
coreneuron::nrn_VecPlay_delete_from_device
void nrn_VecPlay_delete_from_device(NrnThread *nt)
coreneuron::NrnThread::trajec_requests
TrajectoryRequests * trajec_requests
Definition: multicore.hpp:146
coreneuron::NetReceiveBuffer_t::_nrb_index
int * _nrb_index
Definition: mechanism.hpp:43
coreneuron::NrnThread::n_weight
int n_weight
Definition: multicore.hpp:91
coreneuron::NrnThread::n_pntproc
int n_pntproc
Definition: multicore.hpp:90
corenrn_parameters.hpp
coreneuron::nrn_newtonspace_delete_from_device
void nrn_newtonspace_delete_from_device(NewtonSpace *ns)
Definition: nrn_acc_manager.cpp:1290
coreneuron::nrn_sparseobj_delete_from_device
void nrn_sparseobj_delete_from_device(SparseObj *so)
Definition: nrn_acc_manager.cpp:1392
coreneuron::NrnThread::_v_parent_index
int * _v_parent_index
Definition: multicore.hpp:126
coreneuron::InterleaveInfo::stridedispl
int * stridedispl
Definition: cellorder.hpp:58
coreneuron::cnrn_target_copyin
T * cnrn_target_copyin(std::string_view file, int line, const T *h_ptr, std::size_t len=1)
Definition: offload.hpp:110
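cnrn_target_copyin, together with cnrn_target_memcpy_to_device and cnrn_target_delete listed above, forms the low-level device-data API used throughout nrn_acc_manager.cpp. Below is a minimal usage sketch, assuming a CoreNEURON GPU build with coreneuron/utils/offload.hpp available and assuming that these helpers, like cnrn_target_deviceptr, are reachable through convenience macros that inject __FILE__/__LINE__; the struct Foo and the function copy_foo_to_device are hypothetical.

#include <cstddef>
#include "coreneuron/utils/offload.hpp"  // assumed include path

namespace coreneuron {
struct Foo {               // hypothetical host-side struct with a nested array
    double* data;
    std::size_t n;
};

void copy_foo_to_device(Foo* h_foo) {
    // Copy the struct, then its array, then patch the device copy so that
    // d_foo->data points at the device array instead of the host one.
    Foo* d_foo = cnrn_target_copyin(h_foo);
    double* d_data = cnrn_target_copyin(h_foo->data, h_foo->n);
    cnrn_target_memcpy_to_device(&(d_foo->data), &d_data);

    // ... offloaded kernels work on d_foo here ...

    // Release in reverse order of the copyin calls.
    cnrn_target_delete(h_foo->data, h_foo->n);
    cnrn_target_delete(h_foo);
}
}  // namespace coreneuron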
coreneuron::NrnThread::_dt
double _dt
Definition: multicore.hpp:77
coreneuron::InterleaveInfo::cellsize
int * cellsize
Definition: cellorder.hpp:62
coreneuron::NrnThread::_ml_list
Memb_list ** _ml_list
Definition: multicore.hpp:81
coreneuron::update
void update(NrnThread *_nt)
Definition: fadvance_core.cpp:201
nrniv_decl.h
coreneuron::PreSyn
Definition: netcon.hpp:104
coreneuron::dt
double dt
Definition: register_mech.cpp:22
coreneuron::cnrn_target_copyin_debug
void cnrn_target_copyin_debug(std::string_view file, int line, std::size_t sizeof_T, std::type_info const &typeid_T, void const *h_ptr, std::size_t len, void *d_ptr)
Definition: nrn_acc_manager.cpp:97
coreneuron::update_net_send_buffer_on_host
void update_net_send_buffer_on_host(NrnThread *nt, NetSendBuffer_t *nsb)
Definition: nrn_acc_manager.cpp:974
coreneuron::realloc_net_receive_buffer
void realloc_net_receive_buffer(NrnThread *nt, Memb_list *ml)
Definition: nrn_acc_manager.cpp:811
coreneuron::InterleaveInfo::firstnode
int * firstnode
Definition: cellorder.hpp:60
coreneuron::nrn_newtonspace_copyto_device
void nrn_newtonspace_copyto_device(NewtonSpace *ns)
Definition: nrn_acc_manager.cpp:1248
coreneuron::CoreNeuron::get_prop_dparam_size
auto & get_prop_dparam_size()
Definition: coreneuron.hpp:170
coreneuron::NrnThread::shadow_rhs_cnt
int shadow_rhs_cnt
Definition: multicore.hpp:135
coreneuron::CoreNeuron::get_memb_funcs
auto & get_memb_funcs()
Definition: coreneuron.hpp:134
coreneuron::NewtonSpace::low_value
double * low_value
Definition: newton_struct.h:21
coreneuron::nrn_ion_global_map
double ** nrn_ion_global_map
coreneuron::NetReceiveBuffer_t
Definition: mechanism.hpp:41
coreneuron::NrnThread::_net_send_buffer
int * _net_send_buffer
Definition: multicore.hpp:140
coreneuron::NrnThread::n_presyn
int n_presyn
Definition: multicore.hpp:94
coreneuron::nrnmpi_local_rank
mpi_function< cnrn_make_integral_constant_t(nrnmpi_local_rank_impl)> nrnmpi_local_rank
Definition: nrnmpidec.cpp:54
coreneuron::NrnThread::_ndata
size_t _ndata
Definition: multicore.hpp:103
coreneuron::nrn_ion_global_map_size
int nrn_ion_global_map_size
coreneuron::NetSendBuffer_t::_weight_index
int * _weight_index
Definition: mechanism.hpp:66
coreneuron::NrnThread
Definition: multicore.hpp:75
coreneuron::NrnThreadMembList
Definition: multicore.hpp:32
coreneuron::SparseObj
Definition: mod2c_core_thread.hpp:47
coreneuron::NetReceiveBuffer_t::_cnt
int _cnt
Definition: mechanism.hpp:49
coreneuron::cnrn_target_memcpy_to_device_debug
void cnrn_target_memcpy_to_device_debug(std::string_view file, int line, std::size_t sizeof_T, std::type_info const &typeid_T, void const *h_ptr, std::size_t len, void *d_ptr)
Definition: nrn_acc_manager.cpp:146
coreneuron::TrajectoryRequests::varrays
double ** varrays
Definition: multicore.hpp:60
coreneuron::SparseObj::diag
Elm ** diag
Definition: mod2c_core_thread.hpp:49
coreneuron::NetSendBuffer_t::_vdata_index
int * _vdata_index
Definition: mechanism.hpp:64
coreneuron::cnrn_target_is_present_debug
void cnrn_target_is_present_debug(std::string_view file, int line, std::type_info const &typeid_T, void const *h_ptr, void *d_ptr)
Definition: nrn_acc_manager.cpp:135
coreneuron::Instrumentor::phase
Definition: profiler_interface.h:289
coreneuron::NrnThread::_fornetcon_perm_indices_size
std::size_t _fornetcon_perm_indices_size
Definition: multicore.hpp:149
coreneuron::NetReceiveBuffer_t::_weight_index
int * _weight_index
Definition: mechanism.hpp:46
netcon.hpp
coreneuron::NrnThread::_shadow_rhs
double * _shadow_rhs
Definition: multicore.hpp:118
coreneuron::NetSendBuffer_t::_pnt_index
int * _pnt_index
Definition: mechanism.hpp:65
coreneuron::interleave_info
InterleaveInfo * interleave_info
Definition: cellorder.cpp:29
coreneuron::fixed_vector< double >
coreneuron::corenrn_param
corenrn_parameters corenrn_param
Global instance of the CoreNEURON runtime parameters.
Definition: corenrn_parameters.cpp:268
coreneuron::NrnThread::stream_id
int stream_id
Definition: multicore.hpp:137
coreneuron::SparseObj::_cntml_padded
unsigned _cntml_padded
Definition: mod2c_core_thread.hpp:52
coreneuron::InterleaveInfo::stride
int * stride
Definition: cellorder.hpp:59
newton_struct.h
coreneuron::NrnThread::_data
double * _data
Definition: multicore.hpp:106
coreneuron::Elm
Definition: mod2c_core_thread.hpp:28
coreneuron::nrn_threads
NrnThread * nrn_threads
Definition: multicore.cpp:56
coreneuron::corenrn
CoreNeuron corenrn
Definition: multicore.cpp:53
coreneuron::TrajectoryRequests
Definition: multicore.hpp:57
coreneuron::NetSendBuffer_t::_nsb_flag
double * _nsb_flag
Definition: mechanism.hpp:68
coreneuron::IvocVect
fixed_vector< double > IvocVect
Definition: ivocvect.hpp:72
coreneuron::nrn_abort
void nrn_abort(int errcode)
Definition: utils.cpp:13
coreneuron::NrnThread::_actual_d
double * _actual_d
Definition: multicore.hpp:112
coreneuron::NetSendBuffer_t::_sendtype
int * _sendtype
Definition: mechanism.hpp:63
coreneuron::update_net_receive_buffer
void update_net_receive_buffer(NrnThread *nt)
Definition: nrn_acc_manager.cpp:928
coreneuron::Memb_list::_net_receive_buffer
NetReceiveBuffer_t * _net_receive_buffer
Definition: mechanism.hpp:142
coreneuron::NRB_P
std::pair< int, int > NRB_P
Definition: nrn_acc_manager.cpp:877
coreneuron::SparseObj::coef_list
double ** coef_list
Definition: mod2c_core_thread.hpp:59
coreneuron::NrnThread::_fornetcon_perm_indices
size_t * _fornetcon_perm_indices
Definition: multicore.hpp:150
coreneuron::NrnThread::_vecplay
void ** _vecplay
Definition: multicore.hpp:109
coreneuron::Elm::c_left
struct Elm * c_left
Definition: mod2c_core_thread.hpp:34
coreneuron::NrnThreadMembList::next
NrnThreadMembList * next
Definition: multicore.hpp:33
weights
#define weights
Definition: md1redef.h:42
coreneuron::TrajectoryRequests::bsize
int bsize
Definition: multicore.hpp:64
coreneuron::NrnThread::weights
double * weights
Definition: multicore.hpp:88
cellorder.hpp
coreneuron::NetSendBuffer_t::_nsb_t
double * _nsb_t
Definition: mechanism.hpp:67
coreneuron::SparseObj::neqn
unsigned neqn
Definition: mod2c_core_thread.hpp:51
coreneuron::cnrn_target_get_num_devices
int cnrn_target_get_num_devices()
Definition: nrn_acc_manager.cpp:227
cnrn_target_deviceptr
#define cnrn_target_deviceptr(...)
Definition: offload.hpp:188
coreneuron::NrnFastImem::nrn_sav_rhs
double * nrn_sav_rhs
Definition: multicore.hpp:53
coreneuron::Elm::c_right
struct Elm * c_right
Definition: mod2c_core_thread.hpp:35
vrecitem.h
multicore.hpp
coreneuron::NrnThread::pntprocs
Point_process * pntprocs
Definition: multicore.hpp:82
data_layout.hpp
coreneuron::NetReceiveBuffer_t::_nrb_flag
double * _nrb_flag
Definition: mechanism.hpp:48
coreneuron::CoreNeuron::get_prop_param_size
auto & get_prop_param_size()
Definition: coreneuron.hpp:166
coreneuron::NetSendBuffer_t::_cnt
int _cnt
Definition: mechanism.hpp:69
coreneuron::NewtonSpace::rowmax
double * rowmax
Definition: newton_struct.h:22
coreneuron::corenrn_parameters_data::mpi_enable
bool mpi_enable
Enable MPI support.
Definition: corenrn_parameters.hpp:59
coreneuron::NrnThread::_actual_v
double * _actual_v
Definition: multicore.hpp:115
coreneuron::NrnFastImem
Definition: multicore.hpp:52
coreneuron::InterleaveInfo
Definition: cellorder.hpp:50
coreneuron::comp::operator()
bool operator()(const NRB_P &a, const NRB_P &b)
Definition: nrn_acc_manager.cpp:880
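NRB_P (a std::pair<int, int>) and the comparator comp above follow the standard pattern of ordering integer pairs in a std::priority_queue. A self-contained sketch of that pattern follows; the ordering criterion used here (smallest .first popped first) is an assumption for illustration and need not match the criterion used in nrn_acc_manager.cpp.

#include <cstdio>
#include <queue>
#include <utility>
#include <vector>

using NRB_P = std::pair<int, int>;

// Assumed comparator: order by the first member so the smallest pair.first
// is popped first (a min-heap). Only the mechanism is being illustrated.
struct comp_sketch {
    bool operator()(const NRB_P& a, const NRB_P& b) const {
        return a.first > b.first;
    }
};

int main() {
    std::priority_queue<NRB_P, std::vector<NRB_P>, comp_sketch> q;
    q.push({3, 30});
    q.push({1, 10});
    q.push({2, 20});
    while (!q.empty()) {
        std::printf("%d %d\n", q.top().first, q.top().second);  // prints 1 10, 2 20, 3 30
        q.pop();
    }
}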
coreneuron::nrn_sparseobj_copyto_device
void nrn_sparseobj_copyto_device(SparseObj *so)
Definition: nrn_acc_manager.cpp:1309
coreneuron::NewtonSpace::n
int n
Definition: newton_struct.h:15
coreneuron::NrnThread::_actual_b
double * _actual_b
Definition: multicore.hpp:114
coreneuron::nrnmpi_myid
int nrnmpi_myid
Definition: nrnmpi_def_cinc.cpp:11
coreneuron::SparseObj::rhs
double * rhs
Definition: mod2c_core_thread.hpp:54
coreneuron::TrajectoryRequests::gather
double ** gather
Definition: multicore.hpp:61
coreneuron::Elm::r_up
struct Elm * r_up
Definition: mod2c_core_thread.hpp:32
coreneuron::NrnThread::end
int end
Definition: multicore.hpp:98
coreneuron::nrn_pragma_acc
nrn_pragma_acc(routine vector) static void triang_interleaved2(NrnThread *nt
Definition: ivocvect.cpp:30
coreneuron::NetReceiveBuffer_t::_displ
int * _displ
Definition: mechanism.hpp:42
coreneuron::cnrn_target_deviceptr_debug
void cnrn_target_deviceptr_debug(std::string_view file, int line, std::type_info const &typeid_T, void const *h_ptr, void *d_ptr)
Definition: nrn_acc_manager.cpp:124
coreneuron::NrnThread::_actual_area
double * _actual_area
Definition: multicore.hpp:116
coreneuron::ecalloc_align
void * ecalloc_align(size_t n, size_t size, size_t alignment)
coreneuron::NetReceiveBuffer_t::_size
int _size
Definition: mechanism.hpp:51
coreneuron::NrnThread::ncell
int ncell
Definition: multicore.hpp:97
coreneuron::NrnThread::_permute
int * _permute
Definition: multicore.hpp:127
coreneuron::SparseObj::ngetcall
unsigned * ngetcall
Definition: mod2c_core_thread.hpp:55
coreneuron::NetReceiveBuffer_t::_nrb_t
double * _nrb_t
Definition: mechanism.hpp:47
coreneuron::nrnmpi_local_size
mpi_function< cnrn_make_integral_constant_t(nrnmpi_local_size_impl)> nrnmpi_local_size
Definition: nrnmpidec.cpp:56
coreneuron::NrnThread::n_vecplay
int n_vecplay
Definition: multicore.hpp:101