CoreNEURON
nrnran123.cpp
1 /*
2 # =============================================================================
3 # Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
4 #
5 # See top-level LICENSE file for details.
6 # =============================================================================.
7 */
13 
14 #ifdef CORENEURON_USE_BOOST_POOL
15 #include <boost/pool/pool_alloc.hpp>
16 #include <unordered_map>
17 #endif
18 
19 #include <cmath>
20 #include <iostream>
21 #include <memory>
22 #include <mutex>
23 
24 // Defining these attributes seems to help nvc++ in OpenMP target offload mode.
25 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
26  defined(_OPENMP) && defined(__CUDACC__)
27 #define CORENRN_HOST_DEVICE __host__ __device__
28 #else
29 #define CORENRN_HOST_DEVICE
30 #endif
31 
32 namespace {
33 #ifdef CORENEURON_USE_BOOST_POOL
34 /** Tag type for use with boost::fast_pool_allocator that forwards to
35  * coreneuron::[de]allocate_unified(). Using a Random123-specific type here
36  * makes sure that allocations do not come from the same global pool as other
37  * usage of boost pools for objects with sizeof == sizeof(nrnran123_State).
38  *
39  * The messy m_block_sizes map is just because `deallocate_unified` uses sized
40  * deallocations, but the Boost pool allocators don't. Because this is hidden
41  * behind the pool mechanism, these methods are not called very often and the
42  * overhead is minimal.
43  */
44 struct random123_allocate_unified {
45  using size_type = std::size_t;
46  using difference_type = std::size_t;
47  static char* malloc(const size_type bytes) {
48  std::lock_guard<std::mutex> const lock{m_mutex};
49  static_cast<void>(lock);
50  auto* buffer = coreneuron::allocate_unified(bytes);
51  m_block_sizes[buffer] = bytes;
52  return reinterpret_cast<char*>(buffer);
53  }
54  static void free(char* const block) {
55  std::lock_guard<std::mutex> const lock{m_mutex};
56  static_cast<void>(lock);
57  auto const iter = m_block_sizes.find(block);
58  assert(iter != m_block_sizes.end());
59  auto const size = iter->second;
60  m_block_sizes.erase(iter);
61  return coreneuron::deallocate_unified(block, size);
62  }
63  static std::mutex m_mutex;
64  static std::unordered_map<void*, std::size_t> m_block_sizes;
65 };
66 
67 std::mutex random123_allocate_unified::m_mutex{};
68 std::unordered_map<void*, std::size_t> random123_allocate_unified::m_block_sizes{};
69 
70 using random123_allocator =
71  boost::fast_pool_allocator<coreneuron::nrnran123_State, random123_allocate_unified>;
72 #else
73 using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State>;
74 #endif
75 /* Global data structure per process. Using a unique_ptr here causes [minor]
76  * problems because its destructor can be called very late during application
77  * shutdown. If the destructor calls cudaFree and the CUDA runtime has already
78  * been shut down then tools like cuda-memcheck report errors.
79  */
80 OMP_Mutex g_instance_count_mutex;
81 std::size_t g_instance_count{};
82 
83 #ifdef __CUDACC__
84 #define g_k_qualifiers __device__ __constant__
85 #else
86 #define g_k_qualifiers
87 #endif
88 g_k_qualifiers philox4x32_key_t g_k{};
89 // Cannot refer to g_k directly from a nrn_pragma_acc(routine seq) method like
90 // coreneuron_random123_philox4x32_helper, and cannot have this inlined there at
91 // higher optimisation levels
92 __attribute__((noinline)) philox4x32_key_t& global_state() {
93  return g_k;
94 }
95 } // namespace
96 
97 CORENRN_HOST_DEVICE philox4x32_ctr_t
98 coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State* s) {
99  return philox4x32(s->c, global_state());
100 }
101 
102 namespace coreneuron {
103 std::size_t nrnran123_instance_count() {
104  return g_instance_count;
105 }
106 
107 /* if one sets the global, one should reset all the stream sequences. */
108 uint32_t nrnran123_get_globalindex() {
109  return global_state().v[0];
110 }
111 
112 /* nrn123 streams are created from cpu launcher routine */
113 void nrnran123_set_globalindex(uint32_t gix) {
114  // If the global seed is changing then we shouldn't have any active streams.
115  auto& g_k = global_state();
116  {
117  std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
118  if (g_instance_count != 0 && nrnmpi_myid == 0) {
119  std::cout
120  << "nrnran123_set_globalindex(" << gix
121  << ") called when a non-zero number of Random123 streams (" << g_instance_count
122  << ") were active. This is not safe, some streams will remember the old value ("
123  << g_k.v[0] << ')' << std::endl;
124  }
125  }
126  if (g_k.v[0] != gix) {
127  g_k.v[0] = gix;
128  if (coreneuron::gpu_enabled()) {
129 #ifdef __CUDACC__
130  {
131  auto const code = cudaMemcpyToSymbol(g_k, &g_k, sizeof(g_k));
132  assert(code == cudaSuccess);
133  }
134  {
135  auto const code = cudaDeviceSynchronize();
136  assert(code == cudaSuccess);
137  }
138 #else
139  nrn_pragma_acc(update device(g_k))
140  nrn_pragma_omp(target update to(g_k))
141 #endif
142  }
143  }
144 }
145 
146 void nrnran123_initialise_global_state_on_device() {
147  if (coreneuron::gpu_enabled()) {
148 #ifndef __CUDACC__
149  nrn_pragma_acc(enter data copyin(g_k))
150 #endif
151  }
152 }
153 
154 void nrnran123_destroy_global_state_on_device() {
155  if (coreneuron::gpu_enabled()) {
156 #ifndef __CUDACC__
157  nrn_pragma_acc(exit data delete (g_k))
158 #endif
159  }
160 }
161 
162 /** @brief Allocate a new Random123 stream.
163  * @todo It would be nicer if the API return type was
164  * std::unique_ptr<nrnran123_State, ...not specified...>, so we could use a
165  * custom allocator/deleter and avoid the (fragile) need for matching
166  * nrnran123_deletestream calls.
167  */
168 nrnran123_State* nrnran123_newstream3(uint32_t id1,
169  uint32_t id2,
170  uint32_t id3,
171  bool use_unified_memory) {
172  // The `use_unified_memory` argument is an implementation detail to keep the
173  // old behaviour that some Random123 streams that are known to only be used
174  // from the CPU are allocated using new/delete instead of unified memory.
175  // See OPENACC_EXCLUDED_FILES in coreneuron/CMakeLists.txt. If we dropped
176  // this feature then we could always use coreneuron::unified_allocator.
177 #ifndef CORENEURON_ENABLE_GPU
178  if (use_unified_memory) {
179  throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build.");
180  }
181 #endif
182  nrnran123_State* s{nullptr};
183  if (use_unified_memory) {
184  s = coreneuron::allocate_unique<nrnran123_State>(random123_allocator{}).release();
185  } else {
186  s = new nrnran123_State{};
187  }
188  s->c.v[0] = 0;
189  s->c.v[1] = id3;
190  s->c.v[2] = id1;
191  s->c.v[3] = id2;
192  nrnran123_setseq(s, 0, 0);
193  {
194  std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
195  ++g_instance_count;
196  }
197  return s;
198 }
199 
200 /* nrn123 streams are destroyed from cpu launcher routine */
201 void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) {
202 #ifndef CORENEURON_ENABLE_GPU
203  if (use_unified_memory) {
204  throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build.");
205  }
206 #endif
207  {
208  std::lock_guard<OMP_Mutex> _{g_instance_count_mutex};
209  --g_instance_count;
210  }
211  if (use_unified_memory) {
212  std::unique_ptr<nrnran123_State, coreneuron::alloc_deleter<random123_allocator>> _{s};
213  } else {
214  delete s;
215  }
216 }
217 } // namespace coreneuron
coreneuron::nrnran123_get_globalindex
uint32_t nrnran123_get_globalindex()
Definition: nrnran123.cpp:108
nrnran123.h
OMP_Mutex
Definition: nrnmutdec.hpp:55
coreneuron::allocate_unified
void * allocate_unified(std::size_t num_bytes)
Allocate unified memory in GPU builds iff GPU enabled, otherwise new.
Definition: memory.cpp:26
coreneuron_random123_philox4x32_helper
CORENRN_HOST_DEVICE philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State *s)
Definition: nrnran123.cpp:98
coreneuron::nrnran123_instance_count
std::size_t nrnran123_instance_count()
Definition: nrnran123.cpp:103
nrn_acc_manager.hpp
coreneuron::nrnran123_destroy_global_state_on_device
void nrnran123_destroy_global_state_on_device()
Definition: nrnran123.cpp:154
nrn_pragma_omp
nrn_pragma_acc(routine seq) nrn_pragma_omp(declare target) philox4x32_ctr_t coreneuron_random123_philox4x32_helper(coreneuron::nrnran123_State *s) nrn_pragma_omp(end declare target)
Provide a helper function in global namespace that is declared target for OpenMP offloading to functi...
Definition: nrnran123.h:69
coreneuron::nrnran123_set_globalindex
void nrnran123_set_globalindex(uint32_t gix)
Definition: nrnran123.cpp:113
CORENRN_HOST_DEVICE
#define CORENRN_HOST_DEVICE
Definition: nrnran123.cpp:29
coreneuron
THIS FILE IS AUTO GENERATED DONT MODIFY IT.
Definition: corenrn_parameters.cpp:12
coreneuron::nrnran123_deletestream
void nrnran123_deletestream(nrnran123_State *s, bool use_unified_memory)
Definition: nrnran123.cpp:201
nrnmpi.hpp
coreneuron::nrnran123_newstream3
nrnran123_State * nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3, bool use_unified_memory)
Allocate a new Random123 stream.
Definition: nrnran123.cpp:168
coreneuron::nrnran123_State::c
philox4x32_ctr_t c
Definition: nrnran123.h:56
coreneuron::nrnran123_initialise_global_state_on_device
void nrnran123_initialise_global_state_on_device()
Definition: nrnran123.cpp:146
nrnmutdec.hpp
coreneuron::nrnran123_State
Definition: nrnran123.h:55
coreneuron::gpu_enabled
bool gpu_enabled()
Check if GPU support is enabled.
Definition: memory.cpp:18
coreneuron::deallocate_unified
void deallocate_unified(void *ptr, std::size_t num_bytes)
Deallocate memory allocated by allocate_unified.
Definition: memory.cpp:44
coreneuron::nrnmpi_myid
int nrnmpi_myid
Definition: nrnmpi_def_cinc.cpp:11
g_k_qualifiers
#define g_k_qualifiers
Definition: nrnran123.cpp:86
memory.h
coreneuron::unified_allocator
C++ allocator that uses [de]allocate_unified.
Definition: memory.h:44