#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
#include <cuda_runtime_api.h>
#endif
#if __has_include(<cxxabi.h>)
#include <cxxabi.h>
#endif
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
#include <shared_mutex>
struct present_table_value {
    std::size_t ref_count{}, size{};
    std::byte* dev_ptr{};
};
std::map<std::byte const*, present_table_value> present_table;
std::shared_mutex present_table_mutex;
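// The table is keyed by the host-side start address of each block that has been copied to the
// device. Lookups (which may run concurrently under the mutex's shared mode) map any host pointer
// inside a registered block to the matching device address; registration and removal below take
// the mutex exclusively.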
 
std::string cxx_demangle(const char* mangled) {
    int status{};
    // __cxa_demangle returns a malloc'd buffer, so free() is the matching deleter.
    std::unique_ptr<char, decltype(free)*> demangled{
        abi::__cxa_demangle(mangled, nullptr, nullptr, &status), free};
    return status ? mangled : demangled.get();
}
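// For example, cxx_demangle(typeid(double*).name()) yields "double*" on Itanium-ABI toolchains,
// while the mangled input is returned unchanged when demangling fails (status != 0).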
 
bool cnrn_target_debug_output_enabled() {
    const char* env = std::getenv("CORENEURON_GPU_DEBUG");
    if (!env) {
        return false;
    }
    std::string env_s{env};
    if (env_s == "1") {
        return true;
    } else if (env_s == "0") {
        return false;
    } else {
        throw std::runtime_error("CORENEURON_GPU_DEBUG must be set to 0 or 1 (got " + env_s +
                                 ")");
    }
}
 
bool cnrn_target_enable_debug{cnrn_target_debug_output_enabled()};
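// The flag is evaluated once during static initialisation, so exporting CORENEURON_GPU_DEBUG=1
// before launch enables the tracing helpers below for the whole run; any value other than 0 or 1
// makes start-up fail with the runtime_error above.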
 
                              std::type_info const& typeid_T,
                              // ...
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_copyin<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T
              << ") -> " << d_ptr << std::endl;
 
                              std::size_t sizeof_T,
                              std::type_info const& typeid_T,
                              // ...
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_delete<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ", " << len << " * " << sizeof_T << " = " << len * sizeof_T << ')'
              << std::endl;
                                 std::type_info const& typeid_T,
                                 // ...
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_deviceptr<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
 
                                  std::type_info const& typeid_T,
                                  // ...
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_is_present<" << cxx_demangle(typeid_T.name())
              << ">(" << h_ptr << ") -> " << d_ptr << std::endl;
 
                                        std::size_t sizeof_T,
                                        std::type_info const& typeid_T,
                                        // ...
    if (!cnrn_target_enable_debug) {
        return;
    }
    std::cerr << file << ':' << line << ": cnrn_target_memcpy_to_device<"
              << cxx_demangle(typeid_T.name()) << ">(" << d_ptr << ", " << h_ptr << ", " << len
              << " * " << sizeof_T << " = " << len * sizeof_T << ')' << std::endl;
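// Illustrative trace only (the file name, line and pointer values are invented, not taken from a
// real run): with CORENEURON_GPU_DEBUG=1 the copyin helper above emits a line of the shape
//   nrn_acc_manager.cpp:123: cnrn_target_copyin<double>(0x7f00..., 5 * 8 = 40) -> 0xa010...
// i.e. host pointer, element count times element size, and the resulting device pointer.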
 
#ifdef CORENEURON_ENABLE_PRESENT_TABLE
std::pair<void*, bool> cnrn_target_deviceptr_impl(bool must_be_present_or_null, void const* h_ptr) {
    if (!h_ptr) {
        return {nullptr, false};
    }
 
    std::shared_lock _{present_table_mutex};
    if (present_table.empty()) {
        return {nullptr, must_be_present_or_null};
    }
    // Find the last table entry whose host base address is not greater than h_ptr.
    auto const iter = std::prev(std::upper_bound(
        present_table.begin(), present_table.end(), h_ptr, [](void const* hp, auto const& entry) {
            return hp < entry.first;
        }));
    if (iter == present_table.end()) {
        return {nullptr, must_be_present_or_null};
    }
    std::byte const* const h_byte_ptr{static_cast<std::byte const*>(h_ptr)};
    std::byte const* const h_start_of_block{iter->first};
    std::size_t const block_size{iter->second.size};
    std::byte* const d_start_of_block{iter->second.dev_ptr};
    bool const is_present{h_byte_ptr < h_start_of_block + block_size};
    if (!is_present) {
        return {nullptr, must_be_present_or_null};
    }
    return {d_start_of_block + (h_byte_ptr - h_start_of_block), false};
}
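// Worked example of the lookup above (addresses invented for illustration): with an entry
// {host = 0x1000, size = 256, dev_ptr = 0x9000}, a query for h_ptr = 0x1010 selects that entry via
// upper_bound/prev, passes the bounds check (0x1010 < 0x1000 + 0x100) and returns
// {0x9000 + 0x10, false}; a pointer that lies outside every registered block instead returns
// {nullptr, must_be_present_or_null}.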
 
void cnrn_target_copyin_update_present_table(void const* h_ptr, void* d_ptr, std::size_t len) {
    // ...
    std::lock_guard _{present_table_mutex};
    present_table_value new_val{};
    new_val.size = len;
    new_val.ref_count = 1;
    new_val.dev_ptr = static_cast<std::byte*>(d_ptr);
    auto const [iter, inserted] = present_table.emplace(static_cast<std::byte const*>(h_ptr),
                                                        new_val);
    if (!inserted) {
        // The same host block is already registered; the new registration must match it exactly.
        assert(iter->second.size == len);
        assert(iter->second.dev_ptr == new_val.dev_ptr);
        ++(iter->second.ref_count);
    }
}
 
void cnrn_target_delete_update_present_table(void const* h_ptr, std::size_t len) {
    // ...
    std::lock_guard _{present_table_mutex};
    auto const iter = present_table.find(static_cast<std::byte const*>(h_ptr));
    assert(iter != present_table.end());
    assert(iter->second.size == len);
    --(iter->second.ref_count);
    if (iter->second.ref_count == 0) {
        present_table.erase(iter);
    }
}
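// Sketch of the intended pairing (my assumption; the real call sites are not part of this excerpt,
// and acc_copyin/acc_delete stand in for whatever mapping calls the wrappers use):
//   void* d_buf = acc_copyin(h_buf, len * sizeof(double));
//   cnrn_target_copyin_update_present_table(h_buf, d_buf, len * sizeof(double));
//   // ... any pointer into the block can now be translated, e.g.
//   // cnrn_target_deviceptr_impl(false, h_buf + 3);
//   cnrn_target_delete_update_present_table(h_buf, len * sizeof(double));
//   acc_delete(h_buf, len * sizeof(double));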
 
int cnrn_target_get_num_devices() {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_device_t device_type = acc_device_nvidia;
    return acc_get_num_devices(device_type);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    return omp_get_num_devices();
#else
    throw std::runtime_error(
        "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build");
#endif
}
 
void cnrn_target_set_default_device(int device_num) {
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENACC)
    acc_set_device_num(device_num, acc_device_nvidia);
#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
    defined(_OPENMP)
    omp_set_default_device(device_num);
    // Keep the CUDA runtime in sync with the OpenMP default device.
    auto const cuda_code = cudaSetDevice(device_num);
    assert(cuda_code == cudaSuccess);
#else
    throw std::runtime_error(
        "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build");
#endif
}
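// Hedged usage sketch (not shown in this excerpt; local_rank is an assumed variable): each rank
// would typically bind itself to one of the node-local devices,
//   int const ndev = cnrn_target_get_num_devices();
//   cnrn_target_set_default_device(local_rank % ndev);
// which is consistent with the "GPUs shared by N ranks per node" reporting further down this file.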
 
#ifdef CORENEURON_ENABLE_GPU
#ifndef CORENEURON_UNIFIED_MEMORY
static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) {
    // ...
    if (ml->global_variables) {
        assert(ml->global_variables_size);
        // ...
                                          ml->global_variables_size);
    // ...
    int n = ml->nodecount;
    // ...
    NetReceiveBuffer_t* nrb = ml->_net_receive_buffer;
    // ...
    NetSendBuffer_t* nsb = ml->_net_send_buffer;
    // ...
        NetSendBuffer_t* d_nsb;
 
static void update_ml_on_host(const Memb_list* ml, int type) {
    // ...
    int n = ml->nodecount;
    // ...
    auto nrb = ml->_net_receive_buffer;
    // ... (array sections of the NetReceiveBuffer listed in the device-to-host update directives)
                               nrb->_pnt_index[:nrb->_size],
                               nrb->_weight_index[:nrb->_size],
                               nrb->_displ[:nrb->_size + 1],
                               nrb->_nrb_index[:nrb->_size])
    // ...
                                      nrb->_pnt_index[:nrb->_size],
                                      nrb->_weight_index[:nrb->_size],
                                      nrb->_displ[:nrb->_size + 1],
                                      nrb->_nrb_index[:nrb->_size])
 
static void delete_ml_from_device(Memb_list* ml, int type) {
    // ...
        NetSendBuffer_t* nsb{ml->_net_send_buffer};
    // ...
        NetReceiveBuffer_t* nrb{ml->_net_receive_buffer};
    // ...
    int n = ml->nodecount;
    // ...
    if (ml->global_variables) {
        assert(ml->global_variables_size);
        // ...
                           ml->global_variables_size);
 
#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < nthreads; i++) {
 
#ifdef CORENEURON_UNIFIED_MEMORY
    for (int i = 0; i < nthreads; i++) {
        // ...
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
        // ...
        printf("\n Warning: No permutation data? Required for linear algebra!");
 
    for (int i = 0; i < nthreads; i++) {
        // ...
        dptr = d__data + 0 * ne;
        // ...
        dptr = d__data + 1 * ne;
        // ...
        dptr = d__data + 2 * ne;
        // ...
        dptr = d__data + 3 * ne;
        // ...
        dptr = d__data + 4 * ne;
        // ...
        dptr = d__data + 5 * ne;
        // ...
            dptr = d__data + 6 * ne;
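        // My reading of the strided offsets above (not stated explicitly in this excerpt):
        // nt->_data is one contiguous allocation in which each per-node vector occupies ne
        // consecutive doubles, so section k starts at d__data + k * ne (rhs, d, a, b, v, area,
        // and, when present, diam in the more deeply nested 6 * ne case). Each assignment
        // re-points dptr at the next section so the matching device-side pointer can be fixed up.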
 
        bool first_tml = true;
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            // ...
            Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index);
            // ...
            double* d_shadow_ptr;
 
                int* d_ptr = nullptr;
                // ...
                int* d_ptr = nullptr;
                // ...
                printf("\n ERROR: only --cell_permute = [12] implemented");
            // ...
            printf("\n WARNING: NrnThread %d not permuted, error for linear algebra?", i);
            // ...
                double** d_tr_varrays{nullptr};
                // ...
                                         &d_fornetcon_perm_indices);
 
#ifdef CORENEURON_ENABLE_GPU
    // ...
    size_t n = from.size();

#ifdef CORENEURON_ENABLE_GPU
    auto const n = vec.size();
    // ...
    static_cast<void>(vec);
 
#ifdef CORENEURON_ENABLE_GPU
    // ...
    auto const realloc = [old_size = nrb->_size, nrb](auto*& ptr, std::size_t extra_size = 0) {
        using T = std::remove_pointer_t<std::remove_reference_t<decltype(ptr)>>;
        static_assert(std::is_trivial<T>::value,
                      "Only trivially constructible and copiable types are supported.");
        static_assert(std::is_same<decltype(ptr), T*&>::value, "ptr should be reference-to-pointer");
        auto* const new_data = static_cast<T*>(ecalloc_align((nrb->_size + extra_size), sizeof(T)));
        std::memcpy(new_data, ptr, (old_size + extra_size) * sizeof(T));
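    // Presumed usage of the helper above (the call sites are outside this excerpt): nrb->_size is
    // grown first and each trivially-copyable buffer of the NetReceiveBuffer is then passed in by
    // reference so it can be swapped for a larger, zero-initialised allocation that keeps the old
    // contents, e.g.
    //   realloc(nrb->_pnt_index);
    //   realloc(nrb->_displ, 1);  // _displ carries one extra trailing element (see _size + 1 above)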
 
#ifdef CORENEURON_ENABLE_GPU
// ...
        if (a.first == b.first) {
            return a.second > b.second;
        }
        return a.first > b.first;
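    // Note: std::priority_queue keeps the element that compares "greatest" on top, so the
    // greater-than comparisons above make nrbq (declared below) a min-heap: entries are popped in
    // ascending order of .first and, on ties, ascending .second.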
 
    if (nrb->_cnt == 0) {
        // ...
    }
    // ...
    std::priority_queue<NRB_P, std::vector<NRB_P>, comp> nrbq;
    for (int i = 0; i < nrb->_cnt; ++i) {
        // ...
    }
    int last_instance_index = -1;
    // ...
    while (!nrbq.empty()) {
        const NRB_P& p = nrbq.top();
        // ...
        if (p.first != last_instance_index) {
            // ...
        }
        nrb->_displ[displ_cnt] = index_cnt;
        last_instance_index = p.first;
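        // My reading of this loop: the pairs are drained from the min-heap so that all events for
        // one mechanism instance come out contiguously; _displ[displ_cnt] then records, CSR-style,
        // where each instance's run of events ends, which is the layout the _displ[:_size + 1]
        // and _nrb_index[:_size] transfers earlier in the file expect.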
 
    for (auto tml = nt->tml; tml; tml = tml->next) {
        // ...
        if (nrb && nrb->_cnt) {
 
#ifdef CORENEURON_ENABLE_GPU
    // ...
        printf("ERROR: NetSendBuffer exceeded during GPU execution (rank %d)\n", nrnmpi_myid);
 
#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < nthreads; i++) {
        // ...
            for (auto tml = nt->tml; tml; tml = tml->next) {
                // ...
                update_ml_on_host(tml->ml, tml->index);
 
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
 
#ifdef CORENEURON_ENABLE_GPU
    for (int i = 0; i < nthreads; i++) {
        // ...
        for (auto tml = nt->tml; tml; tml = tml->next) {
            delete_ml_from_device(tml->ml, tml->index);
 
#ifdef CORENEURON_ENABLE_GPU
    // ...
    for (int i = 0; i < ns->n; ++i) {
        pd = d_jacdat + i * n;
 
#ifdef CORENEURON_ENABLE_GPU

#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // ...
    unsigned n1 = so->neqn + 1;
    // ...
    for (unsigned irow = 1; irow < n1; ++irow) {
        // ...
            if (elm == so->rowst[irow]) {
            // ...
            if (elm->col == elm->row) {
    // ...
    for (unsigned irow = 1; irow < n1; ++irow) {
 
#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_UNIFIED_MEMORY)
    // ...
    unsigned n1 = so->neqn + 1;
    for (unsigned irow = 1; irow < n1; ++irow) {
 
#ifdef CORENEURON_ENABLE_GPU
    // ...
    if (num_devices_per_node == 0) {
        nrn_fatal_error("\n ERROR : Enabled GPU execution but couldn't find NVIDIA GPU!\n");
    }
    // ...
            nrn_fatal_error("Fatal error: asking for '%d' GPUs per node but only '%d' available\n",
                            // ...
                            num_devices_per_node);
    // ...
        std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                  << " ranks per node\n";
 
    for (int i = 0; i < nt->n_vecplay; i++) {
        VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i];
        // ...
        if (vecplay_instance->discon_indices_) {
            // ...
                                     *(d_vecplay_instance->discon_indices_));
 
    for (int i = 0; i < nt->n_vecplay; i++) {
        auto* vecplay_instance = static_cast<VecPlayContinuous*>(nt->_vecplay[i]);
        // ...
        if (vecplay_instance->discon_indices_) {