#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#include <cuda_runtime_api.h>
#if __has_include(<cxxabi.h>)
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
#include <shared_mutex>
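// Bookkeeping for host-to-device pointer translation when the explicit present
// table is enabled: each host allocation is mapped to its device pointer, size
// and reference count, guarded by a shared mutex (readers take std::shared_lock,
// writers std::lock_guard).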
struct present_table_value {
    std::size_t ref_count{}, size{};
    std::byte* dev_ptr{};
};
std::map<std::byte const*, present_table_value> present_table;
std::shared_mutex present_table_mutex;
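// Demangle a typeid() name for readable debug output; on failure the mangled
// name is returned unchanged.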
std::string cxx_demangle(const char* mangled) {
    int status{};
    std::unique_ptr<char, decltype(free)*> demangled{
        abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
    return status ? mangled : demangled.get();
}
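// Debug logging of device data management is opt-in via the CORENEURON_GPU_DEBUG
// environment variable; any value other than "0" or "1" is rejected.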
bool cnrn_target_debug_output_enabled() {
    const char* env = std::getenv("CORENEURON_GPU_DEBUG");
    if (!env) {
        return false;
    }
    std::string env_s{env};
    if (env_s == "1") {
        return true;
    } else if (env_s == "0") {
        return false;
    } else {
        throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s + ")");
    }
}
bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
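// The cnrn_target_*_debug helpers below log one line per data-management call
// (copyin, delete, deviceptr, is_present, memcpy_to_device) with the calling
// file and line, the demangled element type, the pointers involved and the
// transfer size in bytes.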
                              std::type_info const& typeid_T,
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T
              << ") -> " << d_ptr << std::endl;
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')'
              << std::endl;
                                 std::type_info const& typeid_T,
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
                                  std::type_info const& typeid_T,
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
                                        std::size_t sizeof_T,
                                        std::type_info const& typeid_T,
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<"
              << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len
              << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl;
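// With the present table enabled, cnrn_target_deviceptr_impl translates a host
// pointer into the device pointer of the allocation containing it; the boolean
// in the returned pair presumably flags the error case where a mapping was
// required (must_be_present_or_null) but none was found.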
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
    if (!h_ptr) {
        return {nullptr, false};
    }
    std::shared_lock _{present_table_mutex};
    if (present_table.empty()) {
        return {nullptr, must_be_present_or_null};
    }
    auto const iter = std::prev(std::upper_bound(
        present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
            return hp < entry.first;
        }));
    if (iter == present_table.end()) {
        return {nullptr, must_be_present_or_null};
    }
    std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
    std::byte const* const h_start_of_block{iter->first};
    std::size_t const block_size{iter->second.size};
    std::byte* const d_start_of_block{iter->second.dev_ptr};
    bool const is_present{h_byte_ptr < h_start_of_block + block_size};
    if (!is_present) {
        return {nullptr, must_be_present_or_null};
    }
    return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
}
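// Record a new host-to-device mapping when data are copied in; if the same
// host block was already registered, its reference count is bumped instead.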
void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
    std::lock_guard _{present_table_mutex};
    present_table_value new_val{};
    new_val.ref_count = 1;
    new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
    auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
                                                        new_val);
    assert(iter->second.size == len);
    assert(iter->second.dev_ptr == new_val.dev_ptr);
    ++(iter->second.ref_count);
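// Decrement the reference count of an existing mapping and drop it from the
// table once the count reaches zero; the entry must exist and its recorded
// size must match the size being deleted.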
void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
    std::lock_guard _{present_table_mutex};
    auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
    assert(iter != present_table.end());
    assert(iter->second.size == len);
    --(iter->second.ref_count);
    if (iter->second.ref_count == 0) {
        present_table.erase(iter);
    }
}
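// Device enumeration and selection: use the OpenACC API in OpenACC builds, the
// OpenMP runtime when OpenMP offload is preferred, and fail with an exception
// in builds without GPU support.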
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_device_t device_type = acc_device_nvidia;
    return acc_get_num_devices(device_type);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    return omp_get_num_devices();
#else
    throw std::runtime_error(
        "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build");
#endif
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_set_device_num(device_num, acc_device_nvidia);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    omp_set_default_device(device_num);
    auto const cuda_code = cudaSetDevice(device_num);
    assert(cuda_code == cudaSuccess);
#else
    throw std::runtime_error(
        "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
#ifdef CORENEURON_ENABLE_GPU
#ifndef CORENEURON_UNIFIED_MEMORY
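// Copy one mechanism's Memb_list to the device (explicit-memory build only),
// including its global variables and the associated NetReceiveBuffer_t /
// NetSendBuffer_t, and return the device-side Memb_list pointer.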
static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
    if (ml->global_variables) {
        assert(ml->global_variables_size);
                           ml->global_variables_size);
    int n = ml->nodecount;
    NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
    NetSendBuffer_t* nsb = ml->_net_send_buffer;
    NetSendBuffer_t* d_nsb;
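// Synchronise a mechanism's data from the device back to the host; the pragma
// fragments below list the NetReceiveBuffer_t members that are updated, once
// for the OpenACC and once for the OpenMP offload back end (an assumption based
// on the duplicated member list).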
static void update_ml_on_host(const Memb_list* ml, int type) {
    int n = ml->nodecount;
    auto nrb = ml->_net_receive_buffer;
                               nrb->_pnt_index[:nrb->_size],
                               nrb->_weight_index[:nrb->_size],
                               nrb->_displ[:nrb->_size + 1],
                               nrb->_nrb_index[:nrb->_size])
                               nrb->_pnt_index[:nrb->_size],
                               nrb->_weight_index[:nrb->_size],
                               nrb->_displ[:nrb->_size + 1],
                               nrb->_nrb_index[:nrb->_size])
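// Release the device-side copy of a mechanism's Memb_list, walking the same
// members as copy_ml_to_device in roughly reverse order (send/receive buffers,
// mechanism data, global variables).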
static void delete_ml_from_device(Memb_list* ml, int type) {
    NetSendBuffer_t* nsb{ml->_net_send_buffer};
    NetReceiveBuffer_t* nrb{ml->_net_receive_buffer};
    int n = ml->nodecount;
    if (ml->global_variables) {
        assert(ml->global_variables_size);
                           ml->global_variables_size);
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
#ifdef CORENEURON_UNIFIED_MEMORY
    for (int i = 0; i < nthreads; i++) {
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        printf("\n Warning: No permutation data? Required for linear algebra!");
    for (int i = 0; i < nthreads; i++) {
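        // The dptr assignments below step through the fixed-stride sections of
        // the thread's contiguous data block (ne entries per section); which
        // per-node array (rhs, d, a, b, v, area, ...) each k * ne offset selects
        // is an assumption based on the usual CoreNEURON data layout.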
        dptr = d__data + 0 * ne;
        dptr = d__data + 1 * ne;
        dptr = d__data + 2 * ne;
        dptr = d__data + 3 * ne;
        dptr = d__data + 4 * ne;
        dptr = d__data + 5 * ne;
        dptr = d__data + 6 * ne;
        bool first_tml = true;
        for (auto tml = nt->tml; tml; tml = tml->next) {
            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
            double* d_shadow_ptr;
            int* d_ptr = nullptr;
            int* d_ptr = nullptr;
                printf("\n ERROR: only --cell_permute = [12] implemented");
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        double** d_tr_varrays{nullptr};
                                  &d_fornetcon_perm_indices);
#ifdef CORENEURON_ENABLE_GPU
    size_t n = from.size();
#ifdef CORENEURON_ENABLE_GPU
    auto const n = vec.size();
    static_cast<void>(vec);
#ifdef CORENEURON_ENABLE_GPU
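// Grow the NetReceiveBuffer_t arrays on the host: the realloc lambda below
// allocates a larger aligned buffer and copies over the old contents, with
// extra_size covering arrays such as _displ that hold one element more than
// _size.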
    auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
        using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
        static_assert(std::is_trivial<T>::value,
                      "Only trivially constructible and copiable types are supported.");
        static_assert(std::is_same<decltype(ptr), T*&>::value, "ptr should be reference-to-pointer");
        auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
        std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
#ifdef CORENEURON_ENABLE_GPU
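// Sort the pending entries of a NetReceiveBuffer_t: the NRB_P comparator orders
// entries by target instance index (first member) and then by the second member
// (presumably the delivery time) via a priority queue, while _displ records
// where each instance's run of entries starts.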
        if (a.first == b.first) {
            return a.second > b.second;
        }
        return a.first > b.first;
    if (nrb->_cnt == 0) {
    std::priority_queue<NRB_P, std::vector<NRB_P>, comp> nrbq;
    for (int i = 0; i < nrb->_cnt; ++i) {
    int last_instance_index = -1;
    while (!nrbq.empty()) {
        const NRB_P& p = nrbq.top();
        if (p.first != last_instance_index) {
            nrb->_displ[displ_cnt] = index_cnt;
            last_instance_index = p.first;
    for (auto tml = nt->tml; tml; tml = tml->next) {
        if (nrb && nrb->_cnt) {
#ifdef CORENEURON_ENABLE_GPU
        printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        for (auto tml = nt->tml; tml; tml = tml->next) {
            update_ml_on_host(tml->ml, tml->index);
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        for (auto tml = nt->tml; tml; tml = tml->next) {
            delete_ml_from_device(tml->ml, tml->index);
#ifdef CORENEURON_ENABLE_GPU
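// Copy the Newton solver workspace to the device; the Jacobian appears to be
// stored as ns->n rows of length n, so each row pointer pd is rebased onto the
// device allocation d_jacdat.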
    for (int i = 0; i < ns->n; ++i) {
        pd = d_jacdat + i * n;
#ifdef CORENEURON_ENABLE_GPU
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
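// Copy the sparse solver data ("so", apparently a SparseObj with neqn equations)
// to the device: each row list so->rowst[irow] is walked element by element and
// diagonal elements are identified via elm->col == elm->row.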
    unsigned n1 = so->neqn + 1;
    for (unsigned irow = 1; irow < n1; ++irow) {
            if (elm == so->rowst[irow]) {
            if (elm->col == elm->row) {
    for (unsigned irow = 1; irow < n1; ++irow) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    unsigned n1 = so->neqn + 1;
    for (unsigned irow = 1; irow < n1; ++irow) {
#ifdef CORENEURON_ENABLE_GPU
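// GPU initialisation sanity checks: abort if no NVIDIA device is visible, or if
// more GPUs were requested per node than are actually available; otherwise
// report how many ranks share the node's GPUs.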
    if (num_devices_per_node == 0) {
        nrn_fatal_error("\n ERROR : Enabled GPU execution but couldn't find NVIDIA GPU!\n");
        nrn_fatal_error("Fatal error: asking for '%d' GPUs per node but only '%d' available\n",
                        num_devices_per_node);
        std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                  << " ranks per node\n";
    for (int i = 0; i < nt->n_vecplay; i++) {
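        // Each VecPlayContinuous instance owned by the thread is copied to the
        // device, including its discon_indices_ when present.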
        VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];
        if (vecplay_instance->discon_indices_) {
                             *(d_vecplay_instance->discon_indices_));
    for (int i = 0; i < nt->n_vecplay; i++) {
        auto* vecplay_instance = static_cast<VecPlayContinuous*>(nt->_vecplay[i]);
        if (vecplay_instance->discon_indices_) {