My Project
FPGASolverBackend.hpp
1/*
2 Copyright 2020 Equinor ASA
3
4 This file is part of the Open Porous Media project (OPM).
5
6 OPM is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 OPM is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with OPM. If not, see <http://www.gnu.org/licenses/>.
18*/
19
20#ifndef OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
21#define OPM_FPGASOLVER_BACKEND_HEADER_INCLUDED
22
23#include <opm/simulators/linalg/bda/BdaSolver.hpp>
24#include <opm/simulators/linalg/bda/FPGABILU0.hpp>
25
26#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/bicgstab_solver_config.hpp>
27#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/opencl_lib.hpp>
28#include <linearalgebra/ilu0bicgstab/xilinx/src/sda_app/common/fpga_functions_bicgstab.hpp>
29
30namespace Opm
31{
32namespace Accelerator
33{
34
36template <unsigned int block_size>
37class FpgaSolverBackend : public BdaSolver<block_size>
38{
41
42 using Base::N;
43 using Base::Nb;
44 using Base::nnz;
45 using Base::nnzb;
46 using Base::verbosity;
47 using Base::maxit;
48 using Base::tolerance;
49 using Base::initialized;
50
51private:
52 double *rx = nullptr; // reordered x
53 double *rb = nullptr; // reordered b
54 int *fromOrder = nullptr, *toOrder = nullptr;
55 bool analysis_done = false;
56 bool level_scheduling = false;
57
58 // LUMat will shallow copy rowPointers and colIndices of mat/rMat
59 std::unique_ptr<BlockedMatrix> mat = nullptr;
60 BlockedMatrix *rMat = nullptr;
61 std::unique_ptr<Preconditioner> prec = nullptr;
62
63 // vectors with data processed by the preconditioner (input to the kernel)
64 void **processedPointers = nullptr;
65 int *processedSizes = nullptr;
66
67 unsigned int fpga_calls = 0;
68 bool perf_call_enabled = true;
69
70 // per call performance metrics
71 typedef struct {
72 double s_preconditioner_create = 0.0;
73 double s_analysis = 0.0;
74 double s_reorder = 0.0;
75 double s_mem_setup = 0.0;
76 double s_mem_h2d = 0.0;
77 double s_kernel_exec = 0.0;
78 unsigned int n_kernel_exec_cycles = 0;
79 float n_kernel_exec_iters = 0.0;
80 double s_mem_d2h = 0.0;
81 double s_solve = 0.0;
82 double s_postprocess = 0.0;
83 bool converged = false;
84 unsigned int converged_flags = 0;
85 } perf_call_metrics_t;
86 // cumulative performance metrics
87 typedef struct {
88 double s_initialization;
89 double s_preconditioner_setup;
90 double s_preconditioner_create;
91 double s_preconditioner_create_min,s_preconditioner_create_max,s_preconditioner_create_avg;
92 double s_analysis;
93 double s_analysis_min,s_analysis_max,s_analysis_avg;
94 double s_reorder;
95 double s_reorder_min,s_reorder_max,s_reorder_avg;
96 double s_mem_setup;
97 double s_mem_setup_min,s_mem_setup_max,s_mem_setup_avg;
98 double s_mem_h2d;
99 double s_mem_h2d_min,s_mem_h2d_max,s_mem_h2d_avg;
100 double s_kernel_exec;
101 double s_kernel_exec_min,s_kernel_exec_max,s_kernel_exec_avg;
102 unsigned long n_kernel_exec_cycles;
103 unsigned long n_kernel_exec_cycles_min,n_kernel_exec_cycles_max,n_kernel_exec_cycles_avg;
104 float n_kernel_exec_iters;
105 float n_kernel_exec_iters_min,n_kernel_exec_iters_max,n_kernel_exec_iters_avg;
106 double s_mem_d2h;
107 double s_mem_d2h_min,s_mem_d2h_max,s_mem_d2h_avg;
108 double s_solve;
109 double s_solve_min,s_solve_max,s_solve_avg;
110 double s_postprocess;
111 double s_postprocess_min,s_postprocess_max,s_postprocess_avg;
112 unsigned int n_converged;
113 } perf_total_metrics_t;
114 std::vector<perf_call_metrics_t> perf_call;
115 perf_total_metrics_t perf_total;
116 // fpga_config_bits: bit0=do_reset_debug: if 1, will reset debug flags at each state change, otherwise flags are sticky
117 // fpga_config_bits: bit1=absolute_compare: if 1, will compare norm with provided precision value, otherwise it's incremental
118 unsigned int fpga_config_bits = 0;
119 bool fpga_disabled = false;
120 bool platform_awsf1;
121 unsigned int debugbufferSize;
122 unsigned long int *debugBuffer = nullptr;
123 unsigned int *databufferSize = nullptr;
124 unsigned char *dataBuffer[RW_BUF] = {nullptr};
125 unsigned int debug_outbuf_words;
126 int resultsNum;
127 int resultsBufferNum;
128 unsigned int resultsBufferSize[RES_BUF_MAX];
129 unsigned int result_offsets[6];
130 unsigned int kernel_cycles, kernel_iter_run;
131 double norms[4];
132 unsigned char last_norm_idx;
133 bool kernel_aborted, kernel_signature, kernel_overflow;
134 bool kernel_noresults;
135 bool kernel_wrafterend, kernel_dbgfifofull;
136 bool use_residuals = false;
137 bool use_LU_res = false;
138 int sequence = 0;
139 // TODO: these values may be sent via command line parameters
140 unsigned int abort_cycles = 2000000000; // 2x10^9 @ 300MHz is around 6.6 s
141 unsigned int debug_sample_rate = 65535; // max value allowed is 65535, 0 means disabled; reduce to get a finer debug dump
142 int nnzValArrays_size = 0;
143 int L_nnzValArrays_size = 0;
144 int U_nnzValArrays_size = 0;
145 // aliases to areas of the host data buffers
146 long unsigned int *setupArray = nullptr;
147 double **nnzValArrays = nullptr;
148 short unsigned int *columnIndexArray = nullptr;
149 unsigned char *newRowOffsetArray = nullptr;
150 unsigned int *PIndexArray = nullptr;
151 unsigned int *colorSizesArray = nullptr;
152 double **L_nnzValArrays = nullptr;
153 short unsigned int *L_columnIndexArray = nullptr;
154 unsigned char *L_newRowOffsetArray = nullptr;
155 unsigned int *L_PIndexArray = nullptr;
156 unsigned int *L_colorSizesArray = nullptr;
157 double **U_nnzValArrays = nullptr;
158 short unsigned int *U_columnIndexArray = nullptr;
159 unsigned char *U_newRowOffsetArray = nullptr;
160 unsigned int *U_PIndexArray = nullptr;
161 unsigned int *U_colorSizesArray = nullptr;
162 double *BLKDArray = nullptr;
163 double *X1Array = nullptr, *X2Array = nullptr;
164 double *R1Array = nullptr, *R2Array = nullptr;
165 double *LresArray = nullptr, *UresArray = nullptr;
166 double *resultsBuffer[RES_BUF_MAX] = {nullptr}; // alias for data output region
167 // OpenCL variables
168 cl_device_id device_id;
169 cl_context context;
170 cl_command_queue commands;
171 cl_program program;
172 cl_kernel kernel;
173 cl_mem cldata[RW_BUF] = {nullptr};
174 cl_mem cldebug = nullptr;
175 // HW limits/configuration variables
176 unsigned int hw_x_vector_elem;
177 unsigned int hw_max_row_size;
178 unsigned int hw_max_column_size;
179 unsigned int hw_max_colors_size;
180 unsigned short hw_max_nnzs_per_row;
181 unsigned int hw_max_matrix_size;
182 bool hw_use_uram;
183 bool hw_write_ilu0_results;
184 unsigned short hw_dma_data_width;
185 unsigned char hw_x_vector_latency;
186 unsigned char hw_add_latency;
187 unsigned char hw_mult_latency;
188 unsigned char hw_mult_num;
189 unsigned char hw_num_read_ports;
190 unsigned char hw_num_write_ports;
191 unsigned short hw_reset_cycles;
192 unsigned short hw_reset_settle;
193 // debug
194 bool reset_data_buffers = false;
195 bool fill_results_buffers = false;
196 int dump_data_buffers = 0; // 0=disabled, 1=binary format, 2=text format
197 bool dump_results = false;
198 char *data_dir = nullptr;
199 char *basename = nullptr;
200 unsigned short rst_assert_cycles = 0;
201 unsigned short rst_settle_cycles = 0;
202
209 void initialize(int Nb, int nnzbs, int dim, double *vals, int *rows, int *cols);
210
214 void update_system(double *vals, double *b);
215
218 bool analyse_matrix();
219
222 bool create_preconditioner();
223
226 void solve_system(BdaResult &res);
227
229 void generate_statistics(void);
230
231public:
232
239 FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder);
240
243
251 SolverStatus solve_system(std::shared_ptr<BlockedMatrix> matrix, double *b,
252 std::shared_ptr<BlockedMatrix> jacMatrix, WellContributions& wellContribs, BdaResult &res) override;
253
256 void get_result(double *x) override;
257
258}; // end class fpgaSolverBackend
259
260} // namespace Accelerator
261} // namespace Opm
262
263#endif
264
This class is based on the InverseOperatorResult struct from dune/istl/solver.hh. It is needed to prevent ...
Definition: BdaResult.hpp:31
This class serves to simplify choosing between different backend solvers, such as cusparseSolver and ...
Definition: BdaSolver.hpp:45
This struct resembles a blocked csr matrix, like Dune::BCRSMatrix.
Definition: BlockedMatrix.hpp:37
Definition: FPGABILU0.hpp:41
This class implements an ilu0-bicgstab solver on FPGA.
Definition: FPGASolverBackend.hpp:38
FpgaSolverBackend(std::string fpga_bitstream, int linear_solver_verbosity, int maxit, double tolerance, ILUReorder opencl_ilu_reorder)
Construct an fpgaSolver.
Definition: FPGASolverBackend.cpp:50
void get_result(double *x) override
Get result after linear solve, and perform postprocessing if necessary.
Definition: FPGASolverBackend.cpp:210
~FpgaSolverBackend()
Destroy an fpgaSolver, and free memory.
Definition: FPGASolverBackend.cpp:176
This class serves to eliminate the need to include the WellContributions into the matrix (with –matri...
Definition: WellContributions.hpp:53
This file contains a set of helper functions used by VFPProd / VFPInj.
Definition: BlackoilPhases.hpp:27