SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
Functions

template <Target target = Target::HostTask, typename scalar_t>
void slate::internal::trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t>&& A, Matrix<scalar_t>&& B, int priority, int64_t queue_index)
    Triangular matrix multiply.

template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::HostTask>, Side side, scalar_t alpha, TriangularMatrix<scalar_t>& A, Matrix<scalar_t>& B, int priority, int64_t queue_index)
    Triangular matrix multiply.

template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::HostNest>, Side side, scalar_t alpha, TriangularMatrix<scalar_t>& A, Matrix<scalar_t>& B, int priority, int64_t queue_index)
    Triangular matrix multiply.

template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::HostBatch>, Side side, scalar_t alpha, TriangularMatrix<scalar_t>& A, Matrix<scalar_t>& B, int priority, int64_t queue_index)
    Triangular matrix multiply.

template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::Devices>, Side side, scalar_t alpha, TriangularMatrix<scalar_t>& A, Matrix<scalar_t>& B, int priority, int64_t queue_index)
    Triangular matrix multiply.

template <Target target = Target::HostTask, typename scalar_t>
void slate::work::trmm(Side side, scalar_t alpha, TriangularMatrix<scalar_t> A, Matrix<scalar_t> B, uint8_t* bcast, uint8_t* gemm, int64_t lookahead)
    Triangular matrix multiply.
template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::Devices>,
                           Side side,
                           scalar_t alpha,
                           TriangularMatrix<scalar_t>& A,
                           Matrix<scalar_t>& B,
                           int priority,
                           int64_t queue_index)
Triangular matrix multiply.
GPU device batched cuBLAS implementation.
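A self-contained sketch of the batching idea only, not SLATE's device code: tiles of B with the same dimensions are bucketed into groups, and each group is handed to a single batched call. The TileRef struct and batched_trmm_stub below are hypothetical stand-ins; in the real implementation the stub's loop would be one batched cuBLAS call issued on the queue selected by queue_index.

    #include <cstdint>
    #include <map>
    #include <utility>
    #include <vector>

    struct TileRef { float* data; int64_t mb, nb, ld; };  // hypothetical tile view

    // Stand-in for one batched device trmm: every B in 'group' becomes
    // alpha * A * B, with A lower triangular, non-transposed, non-unit
    // diagonal, and A square with dimension B.mb.
    void batched_trmm_stub(const TileRef& A, std::vector<TileRef>& group, float alpha)
    {
        for (TileRef& B : group)
            for (int64_t j = 0; j < B.nb; ++j)
                for (int64_t i = B.mb - 1; i >= 0; --i) {   // bottom-up: rows at or above i still hold the old B
                    float sum = 0;
                    for (int64_t k = 0; k <= i; ++k)
                        sum += A.data[i + k*A.ld] * B.data[k + j*B.ld];
                    B.data[i + j*B.ld] = alpha * sum;
                }
    }

    // Batching idea: bucket B's tiles by shape so each bucket maps to one
    // batched call with uniform dimensions.
    void devices_trmm_sketch(const TileRef& A00, std::vector<TileRef>& B_tiles, float alpha)
    {
        std::map<std::pair<int64_t, int64_t>, std::vector<TileRef>> groups;
        for (const TileRef& t : B_tiles)
            groups[{t.mb, t.nb}].push_back(t);
        for (auto& kv : groups)
            batched_trmm_stub(A00, kv.second, alpha);
    }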
template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::HostBatch>,
                           Side side,
                           scalar_t alpha,
                           TriangularMatrix<scalar_t>& A,
                           Matrix<scalar_t>& B,
                           int priority,
                           int64_t queue_index)
Triangular matrix multiply.
Host batched implementation.
template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::HostNest>,
                           Side side,
                           scalar_t alpha,
                           TriangularMatrix<scalar_t>& A,
                           Matrix<scalar_t>& B,
                           int priority,
                           int64_t queue_index)
Triangular matrix multiply.
Host nested OpenMP implementation.
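A minimal sketch of the nested-loop idea, assuming, as a simplification not taken from SLATE's source, that this internal step multiplies a single triangular tile A(0, 0) into each local tile of B; tile_trmm is a hypothetical callable standing in for that per-tile update.

    #include <cstdint>

    // Sketch only: 'tile_trmm(i, j)' stands in for the per-tile update
    // B(i, j) = alpha * A(0, 0) * B(i, j).
    template <typename TileOp>
    void host_nest_trmm_sketch(int64_t mt, int64_t nt, TileOp tile_trmm)
    {
        // HostNest idea: one flat parallel loop over the whole tile grid of B,
        // with the two index loops collapsed so OpenMP balances all mt*nt
        // independent tile multiplies across threads.
        #pragma omp parallel for collapse(2) schedule(dynamic)
        for (int64_t i = 0; i < mt; ++i)
            for (int64_t j = 0; j < nt; ++j)
                tile_trmm(i, j);
    }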
template <typename scalar_t>
void slate::internal::trmm(internal::TargetType<Target::HostTask>,
                           Side side,
                           scalar_t alpha,
                           TriangularMatrix<scalar_t>& A,
                           Matrix<scalar_t>& B,
                           int priority,
                           int64_t queue_index)
Triangular matrix multiply.
Host OpenMP task implementation.
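For contrast with the nested-loop sketch above, a similarly simplified sketch of the task-based idea behind this overload; again not SLATE's source, tile_trmm is the same hypothetical per-tile callable, and priority maps onto the OpenMP task priority clause.

    #include <cstdint>

    template <typename TileOp>
    void host_task_trmm_sketch(int64_t mt, int64_t nt, TileOp tile_trmm,
                               int priority)
    {
        // HostTask idea: one OpenMP task per tile of B; the taskgroup joins
        // all tile multiplies before returning.
        #pragma omp taskgroup
        for (int64_t i = 0; i < mt; ++i)
            for (int64_t j = 0; j < nt; ++j) {
                #pragma omp task firstprivate(i, j) priority(priority)
                tile_trmm(i, j);   // B(i, j) = alpha * A(0, 0) * B(i, j)
            }
    }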
template <Target target = Target::HostTask, typename scalar_t>
void slate::internal::trmm(Side side,
                           scalar_t alpha,
                           TriangularMatrix<scalar_t>&& A,
                           Matrix<scalar_t>&& B,
                           int priority,
                           int64_t queue_index)
Triangular matrix multiply.
Dispatches to target implementations.
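The dispatch follows the tag-dispatch pattern already visible in the overload list above: the Target-templated entry point constructs an internal::TargetType<target> tag and forwards to the matching tagged overload. A minimal, self-contained sketch of that pattern, with SLATE's matrix arguments replaced by a single float to keep it short:

    namespace sketch {

    enum class Target { HostTask, HostNest, HostBatch, Devices };

    template <Target t> struct TargetType {};   // empty tag type, one per target

    // One tagged overload per target; bodies elided in this sketch.
    inline void trmm(TargetType<Target::HostTask>, float /*alpha*/) { /* host task impl */ }
    inline void trmm(TargetType<Target::Devices>,  float /*alpha*/) { /* device impl    */ }

    // Templated front end: the tag object selects the overload at compile time,
    // mirroring how the Target-templated slate::internal::trmm forwards to the
    // TargetType-tagged overloads documented above.
    template <Target target = Target::HostTask>
    void trmm(float alpha)
    {
        trmm(TargetType<target>(), alpha);
    }

    } // namespace sketch

    // Usage: sketch::trmm<sketch::Target::Devices>(2.0f);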
template <Target target = Target::HostTask, typename scalar_t>
void slate::work::trmm(Side side,
                       scalar_t alpha,
                       TriangularMatrix<scalar_t> A,
                       Matrix<scalar_t> B,
                       uint8_t* bcast,
                       uint8_t* gemm,
                       int64_t lookahead)
Triangular matrix multiply.
Note that A and B are passed by value, so we can transpose them if needed (for side = right) without affecting the caller.
Template Parameters
    target      One of HostTask, HostNest, HostBatch, Devices.
    scalar_t    One of float, double, std::complex<float>, std::complex<double>.

Parameters
    [in] side
        Whether A appears on the left or on the right of B:
        - Side::Left:  \(B = \alpha A B\)
        - Side::Right: \(B = \alpha B A\)

    [in] alpha
        The scalar alpha.

    [in] A
        - If side = Side::Left,  the m-by-m triangular matrix A;
        - if side = Side::Right, the n-by-n triangular matrix A.

    [in,out] B
        On entry, the m-by-n matrix B.
        On exit, overwritten by the result \(\alpha A B\) or \(\alpha B A\).

    [in] bcast
        A raw pointer to dummy vector data. The dummy vector is used for OpenMP dependency tracking only; it does not hold actual data. Entries in the dummy vector represent each column of matrix \(A\) and each row of matrix \(B\). The size of bcast should be the number of block columns of matrix \(A\).

    [in] gemm
        A raw pointer to dummy vector data. The dummy vector is used for OpenMP dependency tracking only; it does not hold actual data. Entries in the dummy vector represent each column of matrix \(A\) and each row of matrix \(B\). The size of gemm should be the number of block columns of matrix \(A\).

    [in] lookahead
        Number of blocks to overlap communication and computation; lookahead >= 0. Default 1.
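A hedged usage sketch based only on the parameters documented above (the include, the parallel/master wrapping, and the driver function name are assumptions, not SLATE's actual driver): the caller owns the two dummy dependency vectors, sizes them by the number of block columns of A, and invokes work::trmm from inside an OpenMP parallel region so it can spawn its dependent tasks.

    #include <cstdint>
    #include <vector>

    #include <slate/slate.hh>   // public SLATE types; whether this header also
                                // declares slate::work::trmm is an assumption

    // Hypothetical driver-level helper, not part of SLATE.
    template <typename scalar_t>
    void trmm_driver_sketch(slate::Side side, scalar_t alpha,
                            slate::TriangularMatrix<scalar_t> A,
                            slate::Matrix<scalar_t> B,
                            int64_t lookahead = 1)
    {
        // Dummy dependency vectors, sized by the number of block columns of A
        // as the bcast/gemm descriptions above require. Their contents are
        // never read; only their addresses feed OpenMP depend clauses.
        std::vector<uint8_t> bcast_vector(A.nt());
        std::vector<uint8_t> gemm_vector(A.nt());

        // Assumption: like other task-based work routines, work::trmm runs
        // inside an OpenMP parallel region so it can create dependent tasks.
        #pragma omp parallel
        #pragma omp master
        {
            slate::work::trmm<slate::Target::HostTask, scalar_t>(
                side, alpha, A, B,
                bcast_vector.data(), gemm_vector.data(), lookahead);
        }
    }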