BLAS++ 2024.05.31
BLAS C++ API
Loading...
Searching...
No Matches

\(C = \alpha \;op(A) \;op(B) + \beta C\) More...

Functions

template<typename TA , typename TB , typename TC >
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, scalar_type< TA, TB, TC > alpha, TA const *A, int64_t lda, TB const *B, int64_t ldb, scalar_type< TA, TB, TC > beta, TC *C, int64_t ldc)
 General matrix-matrix multiply: \(C = \alpha \;op(A) \;op(B) + \beta C\).
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< float > const &alpha, std::vector< float * > const &Aarray, std::vector< int64_t > const &lda, std::vector< float * > const &Barray, std::vector< int64_t > const &ldb, std::vector< float > const &beta, std::vector< float * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info)
 CPU, variable-size batched, float version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< double > const &alpha, std::vector< double * > const &Aarray, std::vector< int64_t > const &lda, std::vector< double * > const &Barray, std::vector< int64_t > const &ldb, std::vector< double > const &beta, std::vector< double * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info)
 CPU, variable-size batched, double version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< std::complex< float > > const &alpha, std::vector< std::complex< float > * > const &Aarray, std::vector< int64_t > const &lda, std::vector< std::complex< float > * > const &Barray, std::vector< int64_t > const &ldb, std::vector< std::complex< float > > const &beta, std::vector< std::complex< float > * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info)
 CPU, variable-size batched, complex<float> version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< std::complex< double > > const &alpha, std::vector< std::complex< double > * > const &Aarray, std::vector< int64_t > const &lda, std::vector< std::complex< double > * > const &Barray, std::vector< int64_t > const &ldb, std::vector< std::complex< double > > const &beta, std::vector< std::complex< double > * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info)
 CPU, variable-size batched, complex<double> version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< float > const &alpha, std::vector< float * > const &Aarray, std::vector< int64_t > const &lda, std::vector< float * > const &Barray, std::vector< int64_t > const &ldb, std::vector< float > const &beta, std::vector< float * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, variable-size batched, float version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< double > const &alpha, std::vector< double * > const &Aarray, std::vector< int64_t > const &lda, std::vector< double * > const &Barray, std::vector< int64_t > const &ldb, std::vector< double > const &beta, std::vector< double * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, variable-size batched, double version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< std::complex< float > > const &alpha, std::vector< std::complex< float > * > const &Aarray, std::vector< int64_t > const &lda, std::vector< std::complex< float > * > const &Barray, std::vector< int64_t > const &ldb, std::vector< std::complex< float > > const &beta, std::vector< std::complex< float > * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, variable-size batched, complex<float> version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< std::complex< double > > const &alpha, std::vector< std::complex< double > * > const &Aarray, std::vector< int64_t > const &lda, std::vector< std::complex< double > * > const &Barray, std::vector< int64_t > const &ldb, std::vector< std::complex< double > > const &beta, std::vector< std::complex< double > * > const &Carray, std::vector< int64_t > const &ldc, size_t batch_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, variable-size batched, complex<double> version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< float > const &alpha, std::vector< float * > const &Aarray, std::vector< int64_t > const &lda, std::vector< float * > const &Barray, std::vector< int64_t > const &ldb, std::vector< float > const &beta, std::vector< float * > const &Carray, std::vector< int64_t > const &ldc, std::vector< size_t > const &group_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, group batched, float version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< double > const &alpha, std::vector< double * > const &Aarray, std::vector< int64_t > const &lda, std::vector< double * > const &Barray, std::vector< int64_t > const &ldb, std::vector< double > const &beta, std::vector< double * > const &Carray, std::vector< int64_t > const &ldc, std::vector< size_t > const &group_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, group batched, double version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< std::complex< float > > const &alpha, std::vector< std::complex< float > * > const &Aarray, std::vector< int64_t > const &lda, std::vector< std::complex< float > * > const &Barray, std::vector< int64_t > const &ldb, std::vector< std::complex< float > > const &beta, std::vector< std::complex< float > * > const &Carray, std::vector< int64_t > const &ldc, std::vector< size_t > const &group_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, group batched, complex<float> version.
 
void blas::batch::gemm (blas::Layout layout, std::vector< blas::Op > const &transA, std::vector< blas::Op > const &transB, std::vector< int64_t > const &m, std::vector< int64_t > const &n, std::vector< int64_t > const &k, std::vector< std::complex< double > > const &alpha, std::vector< std::complex< double > * > const &Aarray, std::vector< int64_t > const &lda, std::vector< std::complex< double > * > const &Barray, std::vector< int64_t > const &ldb, std::vector< std::complex< double > > const &beta, std::vector< std::complex< double > * > const &Carray, std::vector< int64_t > const &ldc, std::vector< size_t > const &group_size, std::vector< int64_t > &info, blas::Queue &queue)
 GPU device, group batched, complex<double> version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc, blas::Queue &queue)
 GPU device, float version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, double alpha, double const *A, int64_t lda, double const *B, int64_t ldb, double beta, double *C, int64_t ldc, blas::Queue &queue)
 GPU device, double version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, std::complex< float > alpha, std::complex< float > const *A, int64_t lda, std::complex< float > const *B, int64_t ldb, std::complex< float > beta, std::complex< float > *C, int64_t ldc, blas::Queue &queue)
 GPU device, complex<float> version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, std::complex< double > alpha, std::complex< double > const *A, int64_t lda, std::complex< double > const *B, int64_t ldb, std::complex< double > beta, std::complex< double > *C, int64_t ldc, blas::Queue &queue)
 GPU device, complex<double> version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc)
 CPU, float version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, double alpha, double const *A, int64_t lda, double const *B, int64_t ldb, double beta, double *C, int64_t ldc)
 CPU, double version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, std::complex< float > alpha, std::complex< float > const *A, int64_t lda, std::complex< float > const *B, int64_t ldb, std::complex< float > beta, std::complex< float > *C, int64_t ldc)
 CPU, complex<float> version.
 
void blas::gemm (blas::Layout layout, blas::Op transA, blas::Op transB, int64_t m, int64_t n, int64_t k, std::complex< double > alpha, std::complex< double > const *A, int64_t lda, std::complex< double > const *B, int64_t ldb, std::complex< double > beta, std::complex< double > *C, int64_t ldc)
 CPU, complex<double> version.
 

Detailed Description

\(C = \alpha \;op(A) \;op(B) + \beta C\)

Function Documentation

◆ gemm()

template<typename TA , typename TB , typename TC >
void blas::gemm ( blas::Layout  layout,
blas::Op  transA,
blas::Op  transB,
int64_t  m,
int64_t  n,
int64_t  k,
scalar_type< TA, TB, TC >  alpha,
TA const *  A,
int64_t  lda,
TB const *  B,
int64_t  ldb,
scalar_type< TA, TB, TC >  beta,
TC *  C,
int64_t  ldc 
)

General matrix-matrix multiply:

\[ C = \alpha op(A) \times op(B) + \beta C, \]

where \(op(X)\) is one of \(op(X) = X\), \(op(X) = X^T\), or \(op(X) = X^H\), alpha and beta are scalars, and A, B, and C are matrices, with \(op(A)\) an m-by-k matrix, \(op(B)\) a k-by-n matrix, and C an m-by-n matrix.

Generic implementation for arbitrary data types.

Parameters
[in] layout: Matrix storage, Layout::ColMajor or Layout::RowMajor.
[in] transA: The operation \(op(A)\) to be used:
  • Op::NoTrans: \(op(A) = A\).
  • Op::Trans: \(op(A) = A^T\).
  • Op::ConjTrans: \(op(A) = A^H\).
[in] transB: The operation \(op(B)\) to be used:
  • Op::NoTrans: \(op(B) = B\).
  • Op::Trans: \(op(B) = B^T\).
  • Op::ConjTrans: \(op(B) = B^H\).
[in] m: Number of rows of the matrix C and \(op(A)\). m >= 0.
[in] n: Number of columns of the matrix C and \(op(B)\). n >= 0.
[in] k: Number of columns of \(op(A)\) and rows of \(op(B)\). k >= 0.
[in] alpha: Scalar alpha. If alpha is zero, A and B are not accessed.
[in] A:
  • If transA = NoTrans: the m-by-k matrix A, stored in an lda-by-k array [RowMajor: m-by-lda].
  • Otherwise: the k-by-m matrix A, stored in an lda-by-m array [RowMajor: k-by-lda].
[in] lda: Leading dimension of A.
  • If transA = NoTrans: lda >= max(1, m) [RowMajor: lda >= max(1, k)].
  • Otherwise: lda >= max(1, k) [RowMajor: lda >= max(1, m)].
[in] B:
  • If transB = NoTrans: the k-by-n matrix B, stored in an ldb-by-n array [RowMajor: k-by-ldb].
  • Otherwise: the n-by-k matrix B, stored in an ldb-by-k array [RowMajor: n-by-ldb].
[in] ldb: Leading dimension of B.
  • If transB = NoTrans: ldb >= max(1, k) [RowMajor: ldb >= max(1, n)].
  • Otherwise: ldb >= max(1, n) [RowMajor: ldb >= max(1, k)].
[in] beta: Scalar beta. If beta is zero, C need not be set on input.
[in,out] C: The m-by-n matrix C, stored in an ldc-by-n array [RowMajor: m-by-ldc]. On exit, C is overwritten by \(\alpha \;op(A) \;op(B) + \beta C\).
[in] ldc: Leading dimension of C. ldc >= max(1, m) [RowMajor: ldc >= max(1, n)].