SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
|
Namespace used for target implementations. More...
Typedefs | |
using | ProgressVector = std::vector< std::atomic< int64_t > > |
using | Progress = std::vector< std::atomic< int64_t > > |
Functions | |
template<Target target, typename scalar_t > | |
void | add (scalar_t alpha, Matrix< scalar_t > &A, scalar_t beta, Matrix< scalar_t > &B, Options const &opts) |
template<Target target, typename scalar_t > | |
void | add (scalar_t alpha, BaseTrapezoidMatrix< scalar_t > &A, scalar_t beta, BaseTrapezoidMatrix< scalar_t > &B, Options const &opts) |
template<Target target, typename scalar_t > | |
void | cholqr (Matrix< scalar_t > &A, Matrix< scalar_t > &R, Options const &opts) |
template<Target target, typename scalar_t > | |
void | cholqr (Matrix< scalar_t > &A, HermitianMatrix< scalar_t > &R, Options const &opts) |
template<Target target, typename matrix_type > | |
void | colNorms (Norm in_norm, matrix_type A, blas::real_type< typename matrix_type::value_type > *values, Options const &opts) |
template<Target target, typename src_matrix_type , typename dst_matrix_type > | |
void | copy (src_matrix_type A, dst_matrix_type B, Options const &opts) |
template<Target target, typename scalar_t > | |
void | gbmm (scalar_t alpha, BandMatrix< scalar_t > &A, Matrix< scalar_t > &B, scalar_t beta, Matrix< scalar_t > &C, Options const &opts) |
template<Target target, typename scalar_t > | |
int64_t | gbtrf (BandMatrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
Distributed parallel band LU factorization. | |
template<Target target, typename scalar_t > | |
void | ge2tb (Matrix< scalar_t > &A, TriangularFactors< scalar_t > &TU, TriangularFactors< scalar_t > &TV, Options const &opts) |
Distributed parallel reduction to band for 3-stage SVD. | |
template<Target target, typename scalar_t > | |
void | gelqf (Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Options const &opts) |
Distributed parallel LQ factorization. | |
template<Target target, typename scalar_t > | |
void | gemmA (scalar_t alpha, Matrix< scalar_t > &A, Matrix< scalar_t > &B, scalar_t beta, Matrix< scalar_t > &C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | gemmC (scalar_t alpha, Matrix< scalar_t > &A, Matrix< scalar_t > &B, scalar_t beta, Matrix< scalar_t > &C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | geqrf (Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Options const &opts) |
Distributed parallel QR factorization. | |
template<Target target, typename scalar_t > | |
int64_t | getrf (Matrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
Distributed parallel LU factorization. | |
template<Target target, typename scalar_t > | |
int64_t | getrf_nopiv (Matrix< scalar_t > &A, Options const &opts) |
Distributed parallel LU factorization without pivoting. | |
template<Target target, typename scalar_t > | |
int64_t | getrf_tntpiv (Matrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
Distributed parallel CALU factorization. | |
template<Target target, typename scalar_t > | |
void | getri (Matrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
Distributed parallel inverse of a general matrix. | |
template<typename scalar_t > | |
void | hb2st_step (HermitianBandMatrix< scalar_t > &A, Matrix< scalar_t > &V, int64_t sweep, int64_t step) |
template<typename scalar_t > | |
void | hb2st_run (HermitianBandMatrix< scalar_t > &A, Matrix< scalar_t > &V, int thread_rank, int thread_size, ProgressVector &progress) |
template<Target target, typename scalar_t > | |
void | hb2st (HermitianBandMatrix< scalar_t > &A, Matrix< scalar_t > &V, Options const &opts) |
template<Target target, typename scalar_t > | |
void | hbmm (Side side, scalar_t alpha, HermitianBandMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | he2hb (HermitianMatrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Options const &opts) |
Distributed parallel reduction to band for 3-stage Hermitian eigenvalue decomposition. | |
template<Target target, typename scalar_t > | |
void | hegst (int64_t itype, HermitianMatrix< scalar_t > A, HermitianMatrix< scalar_t > B, Options const &opts) |
Distributed parallel reduction of a complex Hermitian positive-definite generalized eigenvalue problem to the standard form. | |
template<Target target, typename scalar_t > | |
void | hemmA (Side side, scalar_t alpha, HermitianMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | hemmC (Side side, scalar_t alpha, HermitianMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | her2k (scalar_t alpha, Matrix< scalar_t > A, Matrix< scalar_t > B, blas::real_type< scalar_t > beta, HermitianMatrix< scalar_t > C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | herk (blas::real_type< scalar_t > alpha, Matrix< scalar_t > A, blas::real_type< scalar_t > beta, HermitianMatrix< scalar_t > C, Options const &opts) |
template<Target target, typename scalar_t > | |
int64_t | hetrf (HermitianMatrix< scalar_t > &A, Pivots &pivots, BandMatrix< scalar_t > &T, Pivots &pivots2, Matrix< scalar_t > &H, Options const &opts) |
Distributed parallel Hermitian indefinite \(LTL^T\) factorization. | |
template<Target target, typename matrix_type > | |
blas::real_type< typename matrix_type::value_type > | norm (Norm in_norm, matrix_type A, Options const &opts) |
template<Target target, typename scalar_t > | |
int64_t | pbtrf (HermitianBandMatrix< scalar_t > A, Options const &opts) |
Distributed parallel band Cholesky factorization. | |
template<Target target, typename scalar_t > | |
int64_t | potrf (slate::internal::TargetType< target >, HermitianMatrix< scalar_t > A, Options const &opts) |
Distributed parallel Cholesky factorization. | |
template<Target target, typename scalar_t > | |
void | scale (blas::real_type< scalar_t > numer, blas::real_type< scalar_t > denom, Matrix< scalar_t > &A, Options const &opts) |
template<Target target, typename scalar_t > | |
void | scale (blas::real_type< scalar_t > numer, blas::real_type< scalar_t > denom, BaseTrapezoidMatrix< scalar_t > &A, Options const &opts) |
template<Target target, typename scalar_t , typename scalar_t2 > | |
void | scale_row_col (Equed equed, std::vector< scalar_t2 > const &R, std::vector< scalar_t2 > const &C, Matrix< scalar_t > &A, Options const &opts) |
template<Target target, typename scalar_t > | |
void | set (scalar_t offdiag_value, scalar_t diag_value, Matrix< scalar_t > &A, Options const &opts) |
template<Target target, typename scalar_t > | |
void | set (scalar_t offdiag_value, scalar_t diag_value, BaseTrapezoidMatrix< scalar_t > &A, Options const &opts) |
template<Target target, typename scalar_t > | |
void | symm (Side side, scalar_t alpha, SymmetricMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | syr2k (scalar_t alpha, Matrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, SymmetricMatrix< scalar_t > C, Options const &opts) |
template<Target target, typename scalar_t > | |
void | syrk (scalar_t alpha, Matrix< scalar_t > A, scalar_t beta, SymmetricMatrix< scalar_t > C, Options const &opts) |
template<typename scalar_t > | |
void | tb2bd_step (TriangularBandMatrix< scalar_t > &A, Matrix< scalar_t > &U, Matrix< scalar_t > &V, int64_t band, int64_t sweep, int64_t step) |
template<typename scalar_t > | |
void | tb2bd_run (TriangularBandMatrix< scalar_t > &A, Matrix< scalar_t > &U, Matrix< scalar_t > &V, int64_t band, int64_t diag_len, int64_t pass_size, int thread_rank, int thread_size, Progress &progress) |
template<Target target, typename scalar_t > | |
void | tb2bd (TriangularBandMatrix< scalar_t > &A, Matrix< scalar_t > &U, Matrix< scalar_t > &V, Options const &opts) |
template<Target target, typename scalar_t > | |
void | tbsm (Side side, scalar_t alpha, TriangularBandMatrix< scalar_t > A, Pivots &pivots, Matrix< scalar_t > B, Options const &opts) |
template<Target target, typename scalar_t > | |
void | trmm (Side side, scalar_t alpha, TriangularMatrix< scalar_t > &A, Matrix< scalar_t > &B, Options const &opts) |
template<Target target, typename scalar_t > | |
void | trsmA (Side side, scalar_t alpha, TriangularMatrix< scalar_t > &A, Matrix< scalar_t > &B, Options const &opts) |
template<Target target, typename scalar_t > | |
void | trsmB (Side side, scalar_t alpha, TriangularMatrix< scalar_t > &A, Matrix< scalar_t > &B, Options const &opts) |
template<Target target, typename scalar_t > | |
void | trtri (TriangularMatrix< scalar_t > A, Options const &opts) |
Distributed parallel inverse of a triangular matrix. | |
template<Target target, typename scalar_t > | |
void | trtrm (TriangularMatrix< scalar_t > A, Options const &opts) |
todo: update docs: multiply not inverse. | |
template<Target target, typename scalar_t > | |
void | unmlq (Side side, Op op, Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Matrix< scalar_t > &C, Options const &opts) |
Distributed parallel multiply by Q from LQ factorization. | |
template<Target target, typename scalar_t > | |
void | unmqr (Side side, Op op, Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Matrix< scalar_t > &C, Options const &opts) |
Distributed parallel multiply by Q from QR factorization. | |
template<Target target, typename scalar_t > | |
void | unmtr_hb2st (Side side, Op op, Matrix< scalar_t > &V, Matrix< scalar_t > &C, const std::map< Option, Value > &opts) |
Distributed parallel unmtr_hb2st. | |
Namespace used for target implementations.
This differentiates, for example, between target implementations such as impl::gemm&lt;Target::HostTask&gt;( ... ) and impl::gemm&lt;Target::Device&gt;( ... ), selected at compile time via the Target template parameter.
void slate::impl::add | ( | scalar_t | alpha, |
BaseTrapezoidMatrix< scalar_t > & | A, | ||
scalar_t | beta, | ||
BaseTrapezoidMatrix< scalar_t > & | B, | ||
Options const & | opts | ||
) |
Distributed parallel matrix-matrix addition. Generic implementation for any target.
void slate::impl::add | ( | scalar_t | alpha, |
Matrix< scalar_t > & | A, | ||
scalar_t | beta, | ||
Matrix< scalar_t > & | B, | ||
Options const & | opts | ||
) |
Distributed parallel general matrix-matrix addition. Generic implementation for any target.
void slate::impl::colNorms | ( | Norm | in_norm, |
matrix_type | A, | ||
blas::real_type< typename matrix_type::value_type > * | values, | ||
Options const & | opts | ||
) |
Distributed parallel computation of matrix column norms. Generic implementation for any target.
void slate::impl::copy | ( | src_matrix_type | A, |
dst_matrix_type | B, | ||
Options const & | opts | ||
) |
Copy and precision conversion. Generic implementation for any target.
void slate::impl::gbmm | ( | scalar_t | alpha, |
BandMatrix< scalar_t > & | A, | ||
Matrix< scalar_t > & | B, | ||
scalar_t | beta, | ||
Matrix< scalar_t > & | C, | ||
Options const & | opts | ||
) |
Distributed parallel general band matrix-matrix multiplication. Generic implementation for any target. Dependencies enforce the following behavior:
int64_t slate::impl::gbtrf | ( | BandMatrix< scalar_t > & | A, |
Pivots & | pivots, | ||
Options const & | opts | ||
) |
Distributed parallel band LU factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
Warning: ColMajor layout is assumed
void slate::impl::ge2tb | ( | Matrix< scalar_t > & | A, |
TriangularFactors< scalar_t > & | TU, | ||
TriangularFactors< scalar_t > & | TV, | ||
Options const & | opts | ||
) |
Distributed parallel reduction to band for 3-stage SVD.
Generic implementation for any target. Panel computed on host using Host OpenMP task.
ColMajor layout is assumed
void slate::impl::gelqf | ( | Matrix< scalar_t > & | A, |
TriangularFactors< scalar_t > & | T, | ||
Options const & | opts | ||
) |
Distributed parallel LQ factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
ColMajor layout is assumed
void slate::impl::gemmC | ( | scalar_t | alpha, |
Matrix< scalar_t > & | A, | ||
Matrix< scalar_t > & | B, | ||
scalar_t | beta, | ||
Matrix< scalar_t > & | C, | ||
Options const & | opts | ||
) |
Distributed parallel general matrix-matrix multiplication. Generic implementation for any target. Dependencies enforce the following behavior:
void slate::impl::geqrf | ( | Matrix< scalar_t > & | A, |
TriangularFactors< scalar_t > & | T, | ||
Options const & | opts | ||
) |
Distributed parallel QR factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
ColMajor layout is assumed
void slate::impl::hb2st_run | ( | HermitianBandMatrix< scalar_t > & | A, |
Matrix< scalar_t > & | V, | ||
int | thread_rank, | ||
int | thread_size, | ||
ProgressVector & | progress | ||
) |
Implements multi-threaded tridiagonal bulge chasing. This is the main routine that each thread runs.
[in,out] | A | The band Hermitian matrix A. |
[out] | V | Matrix of Householder reflectors produced in the process. Dimension 2*band-by-XYZ todo |
[in] | thread_rank | rank of this thread |
[in] | thread_size | number of threads |
[in] | progress | progress table for synchronizing threads |
(Note: the implementation contains a debug trace of the form `printf( "tid %d pass %lld, task %lld, %lld\n", ... )` reporting the thread id, pass, and task indices.)
void slate::impl::hb2st_step | ( | HermitianBandMatrix< scalar_t > & | A, |
Matrix< scalar_t > & | V, | ||
int64_t | sweep, | ||
int64_t | step | ||
) |
Implements the tasks of tridiagonal bulge chasing.
[in,out] | A | The band Hermitian matrix A. |
[out] | V | Matrix of Householder reflectors produced in the process. Dimension 2*band-by-... todo. |
[in] | sweep | The sweep number. One sweep eliminates one row and sweeps the entire matrix. |
[in] | step | The step number. Steps in each sweep have consecutive numbers. |
void slate::impl::hbmm | ( | Side | side, |
scalar_t | alpha, | ||
HermitianBandMatrix< scalar_t > | A, | ||
Matrix< scalar_t > | B, | ||
scalar_t | beta, | ||
Matrix< scalar_t > | C, | ||
Options const & | opts | ||
) |
Distributed parallel Hermitian banded matrix-matrix multiplication. Generic implementation for any target. Dependencies enforce the following behavior:
ColMajor layout is assumed
void slate::impl::hegst | ( | int64_t | itype, |
HermitianMatrix< scalar_t > | A, | ||
HermitianMatrix< scalar_t > | B, | ||
Options const & | opts | ||
) |
Distributed parallel reduction of a complex Hermitian positive-definite generalized eigenvalue problem to the standard form.
Generic implementation for any target.
void slate::impl::her2k | ( | scalar_t | alpha, |
Matrix< scalar_t > | A, | ||
Matrix< scalar_t > | B, | ||
blas::real_type< scalar_t > | beta, | ||
HermitianMatrix< scalar_t > | C, | ||
Options const & | opts | ||
) |
Distributed parallel Hermitian rank 2k update. Generic implementation for any target. Dependencies enforce the following behavior:
int64_t slate::impl::hetrf | ( | HermitianMatrix< scalar_t > & | A, |
Pivots & | pivots, | ||
BandMatrix< scalar_t > & | T, | ||
Pivots & | pivots2, | ||
Matrix< scalar_t > & | H, | ||
Options const & | opts | ||
) |
Distributed parallel Hermitian indefinite \(LTL^T\) factorization.
Generic implementation for any target. GPU version not yet implemented.
blas::real_type< typename matrix_type::value_type > slate::impl::norm | ( | Norm | in_norm, |
matrix_type | A, | ||
Options const & | opts | ||
) |
Distributed parallel general matrix norm. Generic implementation for any target.
int64_t slate::impl::pbtrf | ( | HermitianBandMatrix< scalar_t > | A, |
Options const & | opts | ||
) |
Distributed parallel band Cholesky factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
Warning: ColMajor layout is assumed
int64_t slate::impl::potrf | ( | slate::internal::TargetType< target > | , |
HermitianMatrix< scalar_t > | A, | ||
Options const & | opts | ||
) |
Distributed parallel Cholesky factorization.
Generic implementation for any target.
void slate::impl::scale | ( | blas::real_type< scalar_t > | numer, |
blas::real_type< scalar_t > | denom, | ||
BaseTrapezoidMatrix< scalar_t > & | A, | ||
Options const & | opts | ||
) |
Scale matrix entries by the real scalar numer/denom. Generic implementation for any target.
void slate::impl::scale | ( | blas::real_type< scalar_t > | numer, |
blas::real_type< scalar_t > | denom, | ||
Matrix< scalar_t > & | A, | ||
Options const & | opts | ||
) |
Scale matrix entries by the real scalar numer/denom. Generic implementation for any target.
void slate::impl::scale_row_col | ( | Equed | equed, |
std::vector< scalar_t2 > const & | R, | ||
std::vector< scalar_t2 > const & | C, | ||
Matrix< scalar_t > & | A, | ||
Options const & | opts | ||
) |
Apply row or column scaling, or both, to a Matrix. Generic implementation for any target.
void slate::impl::set | ( | scalar_t | offdiag_value, |
scalar_t | diag_value, | ||
BaseTrapezoidMatrix< scalar_t > & | A, | ||
Options const & | opts | ||
) |
Set matrix entries. Generic implementation for any target.
void slate::impl::set | ( | scalar_t | offdiag_value, |
scalar_t | diag_value, | ||
Matrix< scalar_t > & | A, | ||
Options const & | opts | ||
) |
Set matrix entries. Generic implementation for any target.
void slate::impl::syr2k | ( | scalar_t | alpha, |
Matrix< scalar_t > | A, | ||
Matrix< scalar_t > | B, | ||
scalar_t | beta, | ||
SymmetricMatrix< scalar_t > | C, | ||
Options const & | opts | ||
) |
Distributed parallel symmetric rank 2k update. Generic implementation for any target. Dependencies enforce the following behavior:
void slate::impl::syrk | ( | scalar_t | alpha, |
Matrix< scalar_t > | A, | ||
scalar_t | beta, | ||
SymmetricMatrix< scalar_t > | C, | ||
Options const & | opts | ||
) |
Distributed parallel symmetric rank k update. Generic implementation for any target. Dependencies enforce the following behavior:
void slate::impl::tb2bd | ( | TriangularBandMatrix< scalar_t > & | A, |
Matrix< scalar_t > & | U, | ||
Matrix< scalar_t > & | V, | ||
Options const & | opts | ||
) |
Reduces a band matrix to a bidiagonal matrix using bulge chasing.
void slate::impl::tb2bd_run | ( | TriangularBandMatrix< scalar_t > & | A, |
Matrix< scalar_t > & | U, | ||
Matrix< scalar_t > & | V, | ||
int64_t | band, | ||
int64_t | diag_len, | ||
int64_t | pass_size, | ||
int | thread_rank, | ||
int | thread_size, | ||
Progress & | progress | ||
) |
Implements multi-threaded bidiagonal bulge chasing.
[in,out] | A | The band matrix A. |
[in] | band | The bandwidth of matrix A. |
[in] | diag_len | The length of the diagonal. |
[in] | pass_size | The number of rows eliminated at a time. |
[in] | thread_rank | rank of this thread |
[in] | thread_size | number of threads |
[in] | progress | progress table for synchronizing threads |
void slate::impl::tb2bd_step | ( | TriangularBandMatrix< scalar_t > & | A, |
Matrix< scalar_t > & | U, | ||
Matrix< scalar_t > & | V, | ||
int64_t | band, | ||
int64_t | sweep, | ||
int64_t | step | ||
) |
Implements the tasks of bidiagonal bulge chasing.
[in,out] | A | The band matrix A. |
[out] | U | Matrix to store the Householder vectors applied to the left of the band matrix A. U is 2*nb-by-nt*(nt + 1)/2*nb, where nb is the tile size (A.tileNb(0)) and nt is the number of A tiles (A.nt()). The U matrix needs to be allocated on MPI rank 0, where the band matrix A resides. |
[out] | V | Matrix to store the Householder vectors applied to the right of the band matrix A. V is 2*nb-by-nt*(nt + 1)/2*nb, where nb is the tile size (A.tileNb(0)) and nt is the number of A tiles (A.nt()). The V matrix needs to be allocated on MPI rank 0, where the band matrix A resides. |
[in] | band | The bandwidth of matrix A. |
[in] | sweep | The sweep number. One sweep eliminates one row and sweeps the entire matrix. |
[in] | step | The step number. Steps in each sweep have consecutive numbers. |
void slate::impl::tbsm | ( | Side | side, |
scalar_t | alpha, | ||
TriangularBandMatrix< scalar_t > | A, | ||
Pivots & | pivots, | ||
Matrix< scalar_t > | B, | ||
Options const & | opts | ||
) |
Distributed parallel triangular matrix solve. Generic implementation for any target. Note A and B are passed by value, so we can transpose if needed (for side = right) without affecting caller.
void slate::impl::trmm | ( | Side | side, |
scalar_t | alpha, | ||
TriangularMatrix< scalar_t > & | A, | ||
Matrix< scalar_t > & | B, | ||
Options const & | opts | ||
) |
Distributed parallel triangular matrix-matrix multiplication. Generic implementation for any target.
void slate::impl::trsmB | ( | Side | side, |
scalar_t | alpha, | ||
TriangularMatrix< scalar_t > & | A, | ||
Matrix< scalar_t > & | B, | ||
Options const & | opts | ||
) |
Distributed parallel triangular matrix solve. Generic implementation for any target.
void slate::impl::trtri | ( | TriangularMatrix< scalar_t > | A, |
Options const & | opts | ||
) |
Distributed parallel inverse of a triangular matrix.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
void slate::impl::trtrm | ( | TriangularMatrix< scalar_t > | A, |
Options const & | opts | ||
) |
todo: update docs: multiply not inverse.
Distributed parallel triangular-triangular matrix multiply (computes the in-place product of a triangular matrix with its conjugate transpose, as in LAPACK's lauum; not the inverse — see todo above). Generic implementation for any target.
void slate::impl::unmlq | ( | Side | side, |
Op | op, | ||
Matrix< scalar_t > & | A, | ||
TriangularFactors< scalar_t > & | T, | ||
Matrix< scalar_t > & | C, | ||
Options const & | opts | ||
) |
Distributed parallel multiply by Q from LQ factorization.
Generic implementation for any target.