17inline double fmuls_asum(
double n )
20inline double fadds_asum(
double n )
24inline double fmuls_axpy(
double n )
27inline double fadds_axpy(
double n )
31inline double fmuls_iamax(
double n )
35inline double fadds_iamax(
double n )
39inline double fmuls_nrm2(
double n )
42inline double fadds_nrm2(
double n )
46inline double fmuls_dot(
double n )
49inline double fadds_dot(
double n )
53inline double fmuls_scal(
double n )
56inline double fadds_scal(
double n )
60inline double fmuls_rot(
double n )
63inline double fadds_rot(
double n )
67inline double fmuls_rotm(
double n )
70inline double fadds_rotm(
double n )
79inline double fmuls_gemv(
double m,
double n )
82inline double fadds_gemv(
double m,
double n )
86inline double fmuls_trmv(
double n )
87 {
return 0.5*n*(n + 1); }
89inline double fadds_trmv(
double n )
90 {
return 0.5*n*(n - 1); }
93inline double fmuls_ger(
double m,
double n )
96inline double fadds_ger(
double m,
double n )
100inline double fmuls_gemm(
double m,
double n,
double k )
103inline double fadds_gemm(
double m,
double n,
double k )
119inline double fmuls_gbmm(
double m,
double n,
double k,
double kl,
double ku )
122 return (kl*k + (k+1)*k/2. - (k-ku-1)*(k-ku)/2.)*n;
124 return (ku*m - (m-kl-1)*(m-kl)/2. + (m+1)*m/2.)*n;
125 return (m*k - (m-kl-1)*(m-kl)/2. - (k-ku-1)*(k-ku)/2.)*n;
129inline double fadds_gbmm(
double m,
double n,
double k,
double kl,
double ku )
131 return fmuls_gbmm( m, n, k, kl, ku );
135inline double fmuls_hemm( blas::Side side,
double m,
double n )
136 {
return (side == blas::Side::Left ? m*m*n : m*n*n); }
138inline double fadds_hemm( blas::Side side,
double m,
double n )
139 {
return (side == blas::Side::Left ? m*m*n : m*n*n); }
142inline double fmuls_herk(
double n,
double k )
143 {
return 0.5*k*n*(n+1); }
145inline double fadds_herk(
double n,
double k )
146 {
return 0.5*k*n*(n+1); }
149inline double fmuls_her2k(
double n,
double k )
152inline double fadds_her2k(
double n,
double k )
156inline double fmuls_trmm( blas::Side side,
double m,
double n )
158 if (side == blas::Side::Left)
159 return 0.5*n*m*(m + 1);
161 return 0.5*m*n*(n + 1);
164inline double fadds_trmm( blas::Side side,
double m,
double n )
166 if (side == blas::Side::Left)
167 return 0.5*n*m*(m - 1);
169 return 0.5*m*n*(n - 1);
184 static double asum(
double n )
185 {
return 1e-9 * (n *
sizeof(T)); }
188 static double axpy(
double n )
189 {
return 1e-9 * (3*n *
sizeof(T)); }
192 static double copy(
double n )
193 {
return 1e-9 * (2*n *
sizeof(T)); }
196 static double iamax(
double n )
197 {
return 1e-9 * (n *
sizeof(T)); }
200 static double nrm2(
double n )
201 {
return 1e-9 * (n *
sizeof(T)); }
204 static double dot(
double n )
205 {
return 1e-9 * (2*n *
sizeof(T)); }
208 static double scal(
double n )
209 {
return 1e-9 * (2*n *
sizeof(T)); }
212 static double swap(
double n )
213 {
return 1e-9 * (4*n *
sizeof(T)); }
218 static double gemv(
double m,
double n )
219 {
return 1e-9 * ((m*n + m + n) *
sizeof(T)); }
222 static double hemv(
double n )
223 {
return 1e-9 * ((0.5*(n+1)*n + 2*n) *
sizeof(T)); }
225 static double symv(
double n )
226 {
return hemv( n ); }
229 static double trmv(
double n )
230 {
return 1e-9 * ((0.5*(n+1)*n + 2*n) *
sizeof(T)); }
232 static double trsv(
double n )
233 {
return trmv( n ); }
236 static double ger(
double m,
double n )
237 {
return 1e-9 * ((2*m*n + m + n) *
sizeof(T)); }
240 static double her(
double n )
241 {
return 1e-9 * (((n+1)*n + n) *
sizeof(T)); }
243 static double syr(
double n )
247 static double her2(
double n )
248 {
return 1e-9 * (((n+1)*n + n + n) *
sizeof(T)); }
250 static double syr2(
double n )
251 {
return her2( n ); }
254 static double copy_2d(
double m,
double n )
255 {
return 1e-9 * (2*m*n *
sizeof(T)); }
260 static double gemm(
double m,
double n,
double k )
261 {
return 1e-9 * ((m*k + k*n + 2*m*n) *
sizeof(T)); }
263 static double hemm( blas::Side side,
double m,
double n )
266 double sizeA = (side == blas::Side::Left ? 0.5*m*(m+1) : 0.5*n*(n+1));
267 return 1e-9 * ((sizeA + 3*m*n) *
sizeof(T));
270 static double symm( blas::Side side,
double m,
double n )
271 {
return hemm( side, m, n ); }
273 static double herk(
double n,
double k )
276 double sizeC = 0.5*n*(n+1);
277 return 1e-9 * ((n*k + 2*sizeC) *
sizeof(T));
280 static double syrk(
double n,
double k )
281 {
return herk( n, k ); }
283 static double her2k(
double n,
double k )
286 double sizeC = 0.5*n*(n+1);
287 return 1e-9 * ((2*n*k + 2*sizeC) *
sizeof(T));
290 static double syr2k(
double n,
double k )
291 {
return her2k( n, k ); }
293 static double trmm( blas::Side side,
double m,
double n )
296 if (side == blas::Side::Left)
297 return 1e-9 * ((0.5*(m+1)*m + 2*m*n) *
sizeof(T));
299 return 1e-9 * ((0.5*(n+1)*n + 2*m*n) *
sizeof(T));
302 static double trsm( blas::Side side,
double m,
double n )
303 {
return trmm( side, m, n ); }
312 static constexpr double mul_ops = 1;
313 static constexpr double add_ops = 1;
320class FlopTraits< std::complex<T> >
323 static constexpr double mul_ops = 6;
324 static constexpr double add_ops = 2;
336 static constexpr double mul_ops = FlopTraits<T>::mul_ops;
337 static constexpr double add_ops = FlopTraits<T>::add_ops;
341 static double asum(
double n )
342 {
return 1e-9 * (mul_ops*fmuls_asum(n) +
343 add_ops*fadds_asum(n)); }
345 static double axpy(
double n )
346 {
return 1e-9 * (mul_ops*fmuls_axpy(n) +
347 add_ops*fadds_axpy(n)); }
349 static double copy(
double n )
352 static double iamax(
double n )
353 {
return 1e-9 * (mul_ops*fmuls_iamax(n) +
354 add_ops*fadds_iamax(n)); }
356 static double nrm2(
double n )
357 {
return 1e-9 * (mul_ops*fmuls_nrm2(n) +
358 add_ops*fadds_nrm2(n)); }
360 static double dot(
double n )
361 {
return 1e-9 * (mul_ops*fmuls_dot(n) +
362 add_ops*fadds_dot(n)); }
364 static double scal(
double n )
365 {
return 1e-9 * (mul_ops*fmuls_scal(n) +
366 add_ops*fadds_scal(n)); }
368 static double swap(
double n )
371 static double rot(
double n )
372 {
return 1e-9 * (mul_ops*fmuls_rot(n) +
373 add_ops*fadds_rot(n)); }
375 static double rotm(
double n )
376 {
return 1e-9 * (mul_ops*fmuls_rotm(n) +
377 add_ops*fadds_rotm(n)); }
381 static double gemv(
double m,
double n)
382 {
return 1e-9 * (mul_ops*fmuls_gemv(m, n) +
383 add_ops*fadds_gemv(m, n)); }
385 static double symv(
double n)
386 {
return gemv( n, n ); }
388 static double hemv(
double n)
389 {
return symv( n ); }
391 static double trmv(
double n )
392 {
return 1e-9 * (mul_ops*fmuls_trmv(n) +
393 add_ops*fadds_trmv(n)); }
395 static double trsv(
double n )
396 {
return trmv( n ); }
398 static double her(
double n )
399 {
return ger( n, n ); }
401 static double syr(
double n )
404 static double ger(
double m,
double n )
405 {
return 1e-9 * (mul_ops*fmuls_ger(m, n) +
406 add_ops*fadds_ger(m, n)); }
408 static double her2(
double n )
409 {
return 2*
ger( n, n ); }
411 static double syr2(
double n )
412 {
return her2( n ); }
416 static double gemm(
double m,
double n,
double k)
417 {
return 1e-9 * (mul_ops*fmuls_gemm(m, n, k) +
418 add_ops*fadds_gemm(m, n, k)); }
420 static double gbmm(
double m,
double n,
double k,
double kl,
double ku)
421 {
return 1e-9 * (mul_ops*fmuls_gbmm(m, n, k, kl, ku) +
422 add_ops*fadds_gbmm(m, n, k, kl, ku)); }
424 static double hemm(blas::Side side,
double m,
double n)
425 {
return 1e-9 * (mul_ops*fmuls_hemm(side, m, n) +
426 add_ops*fadds_hemm(side, m, n)); }
428 static double hbmm(
double m,
double n,
double kd)
429 {
return gbmm(m, n, m, kd, kd); }
431 static double symm(blas::Side side,
double m,
double n)
432 {
return hemm( side, m, n ); }
434 static double herk(
double n,
double k)
435 {
return 1e-9 * (mul_ops*fmuls_herk(n, k) +
436 add_ops*fadds_herk(n, k)); }
438 static double syrk(
double n,
double k)
439 {
return herk( n, k ); }
441 static double her2k(
double n,
double k)
442 {
return 1e-9 * (mul_ops*fmuls_her2k(n, k) +
443 add_ops*fadds_her2k(n, k)); }
445 static double syr2k(
double n,
double k)
446 {
return her2k( n, k ); }
448 static double trmm(blas::Side side,
double m,
double n)
449 {
return 1e-9 * (mul_ops*fmuls_trmm(side, m, n) +
450 add_ops*fadds_trmm(side, m, n)); }
452 static double trsm(blas::Side side,
double m,
double n)
453 {
return trmm( side, m, n ); }
real_type< T > asum(int64_t n, T const *x, int64_t incx)
Definition asum.hh:35
void axpy(int64_t n, blas::scalar_type< TX, TY > alpha, TX const *x, int64_t incx, TY *y, int64_t incy)
Add scaled vector, .
Definition axpy.hh:43
void copy(int64_t n, TX const *x, int64_t incx, TY *y, int64_t incy)
Copy vector, .
Definition copy.hh:40
void dot(int64_t n, float const *x, int64_t incx, float const *y, int64_t incy, float *result, blas::Queue &queue)
GPU device, float version.
Definition device_dot.cc:139
void gemm(blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_gemm.cc:119
void gemv(blas::Layout layout, blas::Op trans, int64_t m, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TA const *A, int64_t lda, TX const *x, int64_t incx, blas::scalar_type< TA, TX, TY > beta, TY *y, int64_t incy)
General matrix-vector multiply:
Definition gemv.hh:79
void ger(blas::Layout layout, int64_t m, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TX const *x, int64_t incx, TY const *y, int64_t incy, TA *A, int64_t lda)
General matrix rank-1 update:
Definition ger.hh:60
void hemm(blas::Layout layout, blas::Side side, blas::Uplo uplo, int64_t m, int64_t n, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_hemm.cc:107
void hemv(blas::Layout layout, blas::Uplo uplo, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TA const *A, int64_t lda, TX const *x, int64_t incx, blas::scalar_type< TA, TX, TY > beta, TY *y, int64_t incy)
Hermitian matrix-vector multiply:
Definition hemv.hh:69
void her2(blas::Layout layout, blas::Uplo uplo, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TX const *x, int64_t incx, TY const *y, int64_t incy, TA *A, int64_t lda)
Hermitian matrix rank-2 update:
Definition her2.hh:66
void her2k(blas::Layout layout, blas::Uplo uplo, blas::Op trans, int64_t n, int64_t k, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_her2k.cc:100
void her(blas::Layout layout, blas::Uplo uplo, int64_t n, blas::real_type< TA, TX > alpha, TX const *x, int64_t incx, TA *A, int64_t lda)
Hermitian matrix rank-1 update:
Definition her.hh:59
void herk(blas::Layout layout, blas::Uplo uplo, blas::Op trans, int64_t n, int64_t k, float alpha, float const *A, int64_t lda, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_herk.cc:92
int64_t iamax(int64_t n, T const *x, int64_t incx)
Definition iamax.hh:34
void nrm2(int64_t n, float const *x, int64_t incx, float *result, blas::Queue &queue)
GPU device, float version.
Definition device_nrm2.cc:84
void rot(int64_t n, TX *x, int64_t incx, TY *y, int64_t incy, blas::real_type< TX, TY > c, blas::scalar_type< TX, TY > s)
Apply plane rotation:
Definition rot.hh:53
void rotm(int64_t n, TX *x, int64_t incx, TY *y, int64_t incy, blas::scalar_type< TX, TY > const param[5])
Apply modified (fast) plane rotation, H:
Definition rotm.hh:50
void scal(int64_t n, float alpha, float *x, int64_t incx, blas::Queue &queue)
GPU device, float version.
Definition device_scal.cc:65
void swap(int64_t n, float *x, int64_t incx, float *y, int64_t incy, blas::Queue &queue)
GPU device, float version.
Definition device_swap.cc:67
void symm(blas::Layout layout, blas::Side side, blas::Uplo uplo, int64_t m, int64_t n, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_symm.cc:106
void symv(blas::Layout layout, blas::Uplo uplo, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TA const *A, int64_t lda, TX const *x, int64_t incx, blas::scalar_type< TA, TX, TY > beta, TY *y, int64_t incy)
Symmetric matrix-vector multiply:
Definition symv.hh:66
void syr2(blas::Layout layout, blas::Uplo uplo, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TX const *x, int64_t incx, TY const *y, int64_t incy, TA *A, int64_t lda)
Symmetric matrix rank-2 update:
Definition syr2.hh:63
void syr2k(blas::Layout layout, blas::Uplo uplo, blas::Op trans, int64_t n, int64_t k, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_syr2k.cc:107
void syr(blas::Layout layout, blas::Uplo uplo, int64_t n, blas::scalar_type< TA, TX > alpha, TX const *x, int64_t incx, TA *A, int64_t lda)
Symmetric matrix rank-1 update:
Definition syr.hh:56
void syrk(blas::Layout layout, blas::Uplo uplo, blas::Op trans, int64_t n, int64_t k, float alpha, float const *A, int64_t lda, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_syrk.cc:101
void trmm(blas::Layout layout, blas::Side side, blas::Uplo uplo, blas::Op trans, blas::Diag diag, int64_t m, int64_t n, float alpha, float const *A, int64_t lda, float *B, int64_t ldb, blas::Queue &queue)
GPU device, float version.
Definition device_trmm.cc:104
void trmv(blas::Layout layout, blas::Uplo uplo, blas::Op trans, blas::Diag diag, int64_t n, TA const *A, int64_t lda, TX *x, int64_t incx)
Triangular matrix-vector multiply:
Definition trmv.hh:69
void trsm(blas::Layout layout, blas::Side side, blas::Uplo uplo, blas::Op trans, blas::Diag diag, int64_t m, int64_t n, float alpha, float const *A, int64_t lda, float *B, int64_t ldb, blas::Queue &queue)
GPU device, float version.
Definition device_trsm.cc:104
void trsv(blas::Layout layout, blas::Uplo uplo, blas::Op trans, blas::Diag diag, int64_t n, TA const *A, int64_t lda, TX *x, int64_t incx)
Solve the triangular matrix-vector equation.
Definition trsv.hh:73