User Guide
codegen_acc_visitor.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2023 Blue Brain Project, EPFL.
3  * See the top-level LICENSE file for details.
4  *
5  * SPDX-License-Identifier: Apache-2.0
6  */
7 
9 
11 #include "ast/integer.hpp"
13 
14 
15 namespace nmodl {
16 namespace codegen {
17 
18 /****************************************************************************************/
19 /* Routines must be overloaded in backend */
20 /****************************************************************************************/
21 
22 
23 /**
24  * Depending programming model and compiler, we print compiler hint
25  * for parallelization. For example:
26  *
27  * #pragma ivdep
28  * for(int id=0; id<nodecount; id++) {
29  *
30  * #pragma acc parallel loop
31  * for(int id=0; id<nodecount; id++) {
32  *
33  */
35  if (info.artificial_cell) {
36  return;
37  }
38 
39  std::ostringstream present_clause;
40  present_clause << "present(inst";
41 
42  if (type == BlockType::NetReceive) {
43  present_clause << ", nrb";
44  } else {
45  present_clause << ", node_index, data, voltage, indexes, thread";
46  if (type == BlockType::Equation) {
47  present_clause << ", vec_rhs, vec_d";
48  }
49  }
50  present_clause << ')';
51  printer->fmt_line("nrn_pragma_acc(parallel loop {} async(nt->stream_id) if(nt->compute_gpu))",
52  present_clause.str());
53  printer->add_line("nrn_pragma_omp(target teams distribute parallel for if(nt->compute_gpu))");
54 }
55 
56 
58  printer->add_line("nrn_pragma_acc(atomic update)");
59  printer->add_line("nrn_pragma_omp(atomic update)");
60 }
61 
62 
64  printer->add_line("#include <coreneuron/utils/offload.hpp>");
65  printer->add_line("#include <cuda_runtime_api.h>");
66 }
67 
68 
69 std::string CodegenAccVisitor::backend_name() const {
70  return "C++-OpenAcc (api-compatibility)";
71 }
72 
73 
75  // memory for artificial cells should be allocated on CPU
76  if (info.artificial_cell) {
78  return;
79  }
80  printer->add_newline(2);
81  printer->push_block(
82  "static inline void* mem_alloc(size_t num, size_t size, size_t alignment = 16)");
83  printer->add_multi_line(R"CODE(
84  void* ptr;
85  cudaMallocManaged(&ptr, num*size);
86  cudaMemset(ptr, 0, num*size);
87  return ptr;
88  )CODE");
89  printer->pop_block();
90 
91  printer->add_newline(2);
92  printer->push_block("static inline void mem_free(void* ptr)");
93  printer->add_line("cudaFree(ptr);");
94  printer->pop_block();
95 }
96 
97 /**
98  * OpenACC kernels running on GPU doesn't support `abort()`. CUDA/OpenACC supports
99  * `assert()` in device kernel that can be used for similar purpose. Also, `printf`
100  * is supported on device.
101  *
102  * @todo : we need to implement proper error handling mechanism to propogate errors
103  * from GPU to CPU. For example, error code can be returned like original
104  * neuron implementation. For now we use `assert(0==1)` pattern which is
105  * used for OpenACC.
106  */
108  printer->add_newline(2);
109  printer->push_block("static inline void coreneuron_abort()");
110  printer->add_line(R"(printf("Error : Issue while running OpenACC kernel \n");)");
111  printer->add_line("assert(0==1);");
112  printer->pop_block();
113 }
114 
116  printer->push_block("if (nt->compute_gpu)");
118  printer->add_line("i = nsb->_cnt++;");
119  printer->chain_block("else");
120  printer->add_line("i = nsb->_cnt++;");
121  printer->pop_block();
122 }
123 
125  // no-op since can not grow buffer during gpu execution
126 }
127 
128 /**
129  * Each kernel like nrn_init, nrn_state and nrn_cur could be offloaded
130  * to accelerator. In this case, at very top level, we print pragma
131  * for data present. For example:
132  *
133  * \code{.cpp}
134  * void nrn_state(...) {
135  * #pragma acc data present (nt, ml...)
136  * {
137  *
138  * }
139  * }
140  * \endcode
141  */
143  if (!info.artificial_cell) {
144  printer->add_line("nrn_pragma_acc(data present(nt, ml) if(nt->compute_gpu))");
145  printer->add_line("{");
146  printer->increase_indent();
147  }
148 }
149 
150 /**
151  * `INITIAL` block from `NET_RECEIVE` generates `net_init` function. The `net_init`
152  * function pointer is registered with the coreneuron and called from the CPU.
153  * As the data is on GPU, we need to launch `net_init` on the GPU.
154  *
155  * \todo: With the current code structure for NMODL and MOD2C, we use `serial`
156  * construct to launch serial kernels. This is during initialization
157  * but still inefficient. This should be improved when we drop MOD2C.
158  */
160  if (!info.artificial_cell) {
161  printer->add_line("#pragma acc serial present(inst, indexes, weights) if(nt->compute_gpu)");
162  printer->add_line("{");
163  printer->increase_indent();
164  }
165 }
166 
168  if (!info.artificial_cell) {
169  printer->pop_block();
170  }
171 }
172 
174  auto rhs_op = operator_for_rhs();
175  auto d_op = operator_for_d();
176  if (info.point_process) {
178  }
179  printer->fmt_line("vec_rhs[node_id] {} rhs;", rhs_op);
180  if (info.point_process) {
182  }
183  printer->fmt_line("vec_d[node_id] {} g;", d_op);
184 }
185 
187  if (!info.electrode_current) {
188  return;
189  }
190 
191  auto rhs_op = operator_for_rhs();
192  auto d_op = operator_for_d();
193  printer->push_block("if (nt->nrn_fast_imem)");
194  if (info.point_process) {
196  }
197  printer->fmt_line("nt->nrn_fast_imem->nrn_sav_rhs[node_id] {} rhs;", rhs_op);
198  if (info.point_process) {
200  }
201  printer->fmt_line("nt->nrn_fast_imem->nrn_sav_d[node_id] {} g;", d_op);
202  printer->pop_block();
203 }
204 
206  // do nothing
207 }
208 
209 
210 /**
211  * End of print_kernel_enter_data_begin
212  */
214  if (!info.artificial_cell) {
215  printer->pop_block();
216  }
217 }
218 
219 
221  // do nothing
222 }
223 
224 
226  return false;
227 }
228 
229 
231  if (!info.artificial_cell) {
232  printer->push_block("if (nt->compute_gpu)");
233  printer->fmt_line("nrn_pragma_acc(update device ({}))", global_struct_instance());
234  printer->fmt_line("nrn_pragma_omp(target update to({}))", global_struct_instance());
235  printer->pop_block();
236  }
237 }
238 
239 
241  int list_num = info.derivimplicit_list_num;
242  printer->push_block("if(nt->compute_gpu)");
243  printer->add_multi_line(R"CODE(
244  double* device_vec = cnrn_target_copyin(vec, vec_size / sizeof(double));
245  void* device_ns = cnrn_target_deviceptr(*ns);
246  ThreadDatum* device_thread = cnrn_target_deviceptr(thread);
247  )CODE");
248  printer->fmt_line("cnrn_target_memcpy_to_device(&(device_thread[{}]._pvoid), &device_ns);",
249  info.thread_data_index - 1);
250  printer->fmt_line("cnrn_target_memcpy_to_device(&(device_thread[dith{}()].pval), &device_vec);",
251  list_num);
252  printer->pop_block();
253 }
254 
255 
257  if (info.artificial_cell) {
258  return;
259  }
260  printer->fmt_line(
261  "static inline void copy_instance_to_device(NrnThread* nt, Memb_list* ml, {} const* inst);",
262  instance_struct());
263  printer->fmt_line("static inline void delete_instance_from_device({}* inst);",
264  instance_struct());
265 }
266 
267 
269  std::vector<std::string> const& ptr_members) {
270  if (info.artificial_cell) {
271  return;
272  }
273  printer->fmt_push_block(
274  "static inline void copy_instance_to_device(NrnThread* nt, Memb_list* ml, {} const* inst)",
275  instance_struct());
276  printer->push_block("if (!nt->compute_gpu)");
277  printer->add_line("return;");
278  printer->pop_block();
279  printer->fmt_line("auto tmp = *inst;");
280  printer->add_line("auto* d_inst = cnrn_target_is_present(inst);");
281  printer->push_block("if (!d_inst)");
282  printer->add_line("d_inst = cnrn_target_copyin(inst);");
283  printer->pop_block();
284  for (auto const& ptr_mem: ptr_members) {
285  printer->fmt_line("tmp.{0} = cnrn_target_deviceptr(tmp.{0});", ptr_mem);
286  }
287  printer->add_multi_line(R"CODE(
288  cnrn_target_memcpy_to_device(d_inst, &tmp);
289  auto* d_ml = cnrn_target_deviceptr(ml);
290  void* d_inst_void = d_inst;
291  cnrn_target_memcpy_to_device(&(d_ml->instance), &d_inst_void);
292  )CODE");
293  printer->pop_block(); // copy_instance_to_device
294  printer->add_newline();
295 
296  printer->fmt_push_block("static inline void delete_instance_from_device({}* inst)",
297  instance_struct());
298  printer->push_block("if (cnrn_target_is_present(inst))");
299  printer->add_line("cnrn_target_delete(inst);");
300  printer->pop_block();
301  printer->pop_block(); // delete_instance_from_device
302  printer->add_newline();
303 }
304 
305 
307  if (info.artificial_cell) {
308  return;
309  }
310  printer->add_line("copy_instance_to_device(nt, ml, inst);");
311 }
312 
313 
315  if (info.artificial_cell) {
316  return;
317  }
318  printer->add_line("delete_instance_from_device(inst);");
319 }
320 
321 
323  printer->add_line("nrn_pragma_acc(update device (deriv_advance_flag) if(nt->compute_gpu))");
324  printer->add_line("nrn_pragma_omp(target update to(deriv_advance_flag) if(nt->compute_gpu))");
325 }
326 
327 
329  printer->add_line("nrn_pragma_acc(atomic capture)");
330  printer->add_line("nrn_pragma_omp(atomic capture)");
331 }
332 
333 
335  printer->push_block("if(nt->compute_gpu)");
336  printer->add_line("nrn_pragma_acc(wait(nt->stream_id))");
337  printer->pop_block();
338 }
339 
340 
342  printer->add_line("nrn_pragma_acc(update self(nsb->_cnt))");
343  printer->add_line("nrn_pragma_omp(target update from(nsb->_cnt))");
344 }
345 
346 
349  printer->push_block("if (nsb && nt->compute_gpu)");
351  printer->add_line("update_net_send_buffer_on_host(nt, nsb);");
352  printer->pop_block();
353 }
354 
355 
357  printer->push_block("if (nt->compute_gpu)");
358  printer->add_line("nrn_pragma_acc(update device(nsb->_cnt))");
359  printer->add_line("nrn_pragma_omp(target update to(nsb->_cnt))");
360  printer->pop_block();
361 }
362 
363 
365  printer->fmt_line("#pragma acc update device({}) if (nt->compute_gpu)",
367 }
368 
369 } // namespace codegen
370 } // namespace nmodl
nmodl::codegen::CodegenAccVisitor::print_net_send_buf_count_update_to_host
void print_net_send_buf_count_update_to_host() const override
update NetSendBuffer_t count from device to host
Definition: codegen_acc_visitor.cpp:341
nmodl::codegen::CodegenAccVisitor::print_device_stream_wait
void print_device_stream_wait() const override
Print the code to synchronise/wait on stream specific to NrnThread.
Definition: codegen_acc_visitor.cpp:334
nmodl::codegen::CodegenAccVisitor::print_kernel_data_present_annotation_block_begin
void print_kernel_data_present_annotation_block_begin() override
annotations like "acc enter data present(...)" for main kernel
Definition: codegen_acc_visitor.cpp:142
nmodl::codegen::CodegenAccVisitor::print_parallel_iteration_hint
void print_parallel_iteration_hint(BlockType type, const ast::Block *block) override
ivdep like annotation for channel iterations
Definition: codegen_acc_visitor.cpp:34
codegen_acc_visitor.hpp
Visitor for printing C++ code with OpenACC backend
nmodl::codegen::CodegenAccVisitor::print_net_send_buffering_grow
void print_net_send_buffering_grow() override
Replace default implementation by a no-op since the buffer cannot be grown up during gpu execution.
Definition: codegen_acc_visitor.cpp:124
nmodl::codegen::CodegenAccVisitor::print_newtonspace_transfer_to_device
void print_newtonspace_transfer_to_device() const override
transfer newtonspace structure to device
Definition: codegen_acc_visitor.cpp:240
nmodl::codegen::CodegenAccVisitor::print_nrn_cur_matrix_shadow_reduction
void print_nrn_cur_matrix_shadow_reduction() override
reduction to matrix elements from shadow vectors
Definition: codegen_acc_visitor.cpp:205
nmodl::codegen::CodegenAccVisitor::print_net_send_buffering_cnt_update
void print_net_send_buffering_cnt_update() const override
print atomic update of NetSendBuffer_t cnt
Definition: codegen_acc_visitor.cpp:115
nmodl::codegen::CodegenAccVisitor::print_net_init_acc_serial_annotation_block_begin
void print_net_init_acc_serial_annotation_block_begin() override
start of annotation "acc kernels" for net_init kernel
Definition: codegen_acc_visitor.cpp:159
nmodl::codegen::CodegenCppVisitor::info
codegen::CodegenInfo info
All ast information for code generation.
Definition: codegen_cpp_visitor.hpp:331
nmodl::codegen::CodegenCppVisitor::global_struct_instance
std::string global_struct_instance() const
Name of the (host-only) global instance of global_struct
Definition: codegen_cpp_visitor.hpp:444
nmodl::codegen::CodegenAccVisitor::print_deriv_advance_flag_transfer_to_device
void print_deriv_advance_flag_transfer_to_device() const override
update derivimplicit advance flag on the gpu device
Definition: codegen_acc_visitor.cpp:322
nmodl::codegen::CodegenInfo::derivimplicit_list_num
int derivimplicit_list_num
slist/dlist id for derivimplicit block
Definition: codegen_info.hpp:433
nmodl::codegen::CodegenInfo::thread_data_index
int thread_data_index
thread_data_index indicates number of threads being allocated.
Definition: codegen_info.hpp:406
nmodl::codegen::CodegenCppVisitor::operator_for_d
const char * operator_for_d() const noexcept
Operator for diagonal vector update (matrix update)
Definition: codegen_cpp_visitor.hpp:506
nmodl::codegen::CodegenAccVisitor::nrn_cur_reduction_loop_required
bool nrn_cur_reduction_loop_required() override
if reduction block in nrn_cur required
Definition: codegen_acc_visitor.cpp:225
nmodl
encapsulates code generation backend implementations
Definition: ast_common.hpp:26
protect_statement.hpp
Auto generated AST classes declaration.
nmodl::codegen::naming::NTHREAD_DT_VARIABLE
static constexpr char NTHREAD_DT_VARIABLE[]
dt variable in neuron thread structure
Definition: codegen_naming.hpp:108
nmodl::codegen::CodegenAccVisitor::print_device_atomic_capture_annotation
void print_device_atomic_capture_annotation() const override
print atomic capture pragma
Definition: codegen_acc_visitor.cpp:328
nmodl::codegen::CodegenAccVisitor::print_fast_imem_calculation
void print_fast_imem_calculation() override
fast membrane current calculation
Definition: codegen_acc_visitor.cpp:186
nmodl::codegen::CodegenAccVisitor::print_atomic_reduction_pragma
void print_atomic_reduction_pragma() override
atomic update pragma for reduction statements
Definition: codegen_acc_visitor.cpp:57
nmodl::ast::Block
Base class for all block scoped nodes.
Definition: block.hpp:41
nmodl::codegen::CodegenAccVisitor::print_instance_struct_transfer_routine_declarations
void print_instance_struct_transfer_routine_declarations() override
declare helper functions for copying the instance struct to the device
Definition: codegen_acc_visitor.cpp:256
nmodl::codegen::CodegenInfo::artificial_cell
bool artificial_cell
if mod file is artificial cell
Definition: codegen_info.hpp:365
nmodl::codegen::BlockType::Equation
@ Equation
breakpoint block
nmodl::codegen::CodegenAccVisitor::print_nrn_cur_matrix_shadow_update
void print_nrn_cur_matrix_shadow_update() override
update to matrix elements with/without shadow vectors
Definition: codegen_acc_visitor.cpp:173
nmodl::codegen::CodegenAccVisitor::print_instance_struct_copy_to_device
void print_instance_struct_copy_to_device() override
call helper function for copying the instance struct to the device
Definition: codegen_acc_visitor.cpp:306
nmodl::codegen::CodegenAccVisitor::print_rhs_d_shadow_variables
void print_rhs_d_shadow_variables() override
setup method for setting matrix shadow vectors
Definition: codegen_acc_visitor.cpp:220
nmodl::codegen::CodegenCoreneuronCppVisitor::print_memory_allocation_routine
virtual void print_memory_allocation_routine() const
Print memory allocation routine.
Definition: codegen_coreneuron_cpp_visitor.cpp:298
nmodl::codegen::BlockType::NetReceive
@ NetReceive
net_receive block
nmodl::codegen::CodegenInfo::point_process
bool point_process
if mod file is point process
Definition: codegen_info.hpp:362
nmodl::codegen::CodegenAccVisitor::print_dt_update_to_device
void print_dt_update_to_device() const override
update dt from host to device
Definition: codegen_acc_visitor.cpp:364
nmodl::codegen::CodegenCoreneuronCppVisitor::get_variable_name
std::string get_variable_name(const std::string &name, bool use_instance=true) const override
Determine variable name in the structure of mechanism properties.
Definition: codegen_coreneuron_cpp_visitor.cpp:807
nmodl::codegen::CodegenAccVisitor::print_global_variable_device_update_annotation
void print_global_variable_device_update_annotation() override
update global variable from host to the device
Definition: codegen_acc_visitor.cpp:230
nmodl::codegen::CodegenInfo::electrode_current
bool electrode_current
if electrode current specified
Definition: codegen_info.hpp:368
nmodl::codegen::CodegenAccVisitor::print_kernel_data_present_annotation_block_end
void print_kernel_data_present_annotation_block_end() override
end of annotation like "acc enter data"
Definition: codegen_acc_visitor.cpp:213
nmodl::codegen::BlockType
BlockType
Helper to represent various block types.
Definition: codegen_cpp_visitor.hpp:56
nmodl::codegen::CodegenAccVisitor::print_abort_routine
void print_abort_routine() const override
abort routine
Definition: codegen_acc_visitor.cpp:107
eigen_linear_solver_block.hpp
Auto generated AST classes declaration.
nmodl::codegen::CodegenCppVisitor::instance_struct
std::string instance_struct() const
Name of structure that wraps range variables.
Definition: codegen_cpp_visitor.hpp:417
nmodl::codegen::CodegenAccVisitor::print_backend_includes
void print_backend_includes() override
common includes : standard c++, coreneuron and backend specific
Definition: codegen_acc_visitor.cpp:63
nmodl::codegen::CodegenCppVisitor::operator_for_rhs
const char * operator_for_rhs() const noexcept
Operator for rhs vector update (matrix update)
Definition: codegen_cpp_visitor.hpp:498
nmodl::codegen::CodegenAccVisitor::print_instance_struct_delete_from_device
void print_instance_struct_delete_from_device() override
call helper function that deletes the instance struct from the device
Definition: codegen_acc_visitor.cpp:314
nmodl::codegen::CodegenCppVisitor::printer
std::unique_ptr< CodePrinter > printer
Code printer object for target (C++)
Definition: codegen_cpp_visitor.hpp:307
nmodl::codegen::CodegenAccVisitor::print_net_send_buf_update_to_host
void print_net_send_buf_update_to_host() const override
update NetSendBuffer_t from device to host
Definition: codegen_acc_visitor.cpp:347
nmodl::codegen::CodegenAccVisitor::print_net_send_buf_count_update_to_device
void print_net_send_buf_count_update_to_device() const override
update NetSendBuffer_t count from host to device
Definition: codegen_acc_visitor.cpp:356
nmodl::codegen::CodegenAccVisitor::print_instance_struct_transfer_routines
void print_instance_struct_transfer_routines(const std::vector< std::string > &ptr_members) override
define helper functions for copying the instance struct to the device
Definition: codegen_acc_visitor.cpp:268
integer.hpp
Auto generated AST classes declaration.
nmodl::codegen::CodegenAccVisitor::print_net_init_acc_serial_annotation_block_end
void print_net_init_acc_serial_annotation_block_end() override
end of annotation "acc kernels" for net_init kernel
Definition: codegen_acc_visitor.cpp:167
nmodl::codegen::CodegenAccVisitor::backend_name
std::string backend_name() const override
name of the code generation backend
Definition: codegen_acc_visitor.cpp:69
nmodl::codegen::CodegenAccVisitor::print_memory_allocation_routine
void print_memory_allocation_routine() const override
memory allocation routine
Definition: codegen_acc_visitor.cpp:74