SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
Namespace used for target implementations. More...
Typedefs | |
| using | ProgressVector = std::vector< std::atomic< int64_t > > |
| using | Progress = std::vector< std::atomic< int64_t > > |
Functions | |
| template<Target target, typename scalar_t > | |
| void | add (scalar_t alpha, Matrix< scalar_t > &A, scalar_t beta, Matrix< scalar_t > &B, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | add (scalar_t alpha, BaseTrapezoidMatrix< scalar_t > &A, scalar_t beta, BaseTrapezoidMatrix< scalar_t > &B, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | cholqr (Matrix< scalar_t > &A, Matrix< scalar_t > &R, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | cholqr (Matrix< scalar_t > &A, HermitianMatrix< scalar_t > &R, Options const &opts) |
| template<Target target, typename matrix_type > | |
| void | colNorms (Norm in_norm, matrix_type A, blas::real_type< typename matrix_type::value_type > *values, Options const &opts) |
| template<Target target, typename src_matrix_type , typename dst_matrix_type > | |
| void | copy (src_matrix_type A, dst_matrix_type B, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | gbmm (scalar_t alpha, BandMatrix< scalar_t > &A, Matrix< scalar_t > &B, scalar_t beta, Matrix< scalar_t > &C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| int64_t | gbtrf (BandMatrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
| Distributed parallel band LU factorization. | |
| template<Target target, typename scalar_t > | |
| void | ge2tb (Matrix< scalar_t > &A, TriangularFactors< scalar_t > &TU, TriangularFactors< scalar_t > &TV, Options const &opts) |
| Distributed parallel reduction to band for 3-stage SVD. | |
| template<Target target, typename scalar_t > | |
| void | gelqf (Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Options const &opts) |
| Distributed parallel LQ factorization. | |
| template<Target target, typename scalar_t > | |
| void | gemmA (scalar_t alpha, Matrix< scalar_t > &A, Matrix< scalar_t > &B, scalar_t beta, Matrix< scalar_t > &C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | gemmC (scalar_t alpha, Matrix< scalar_t > &A, Matrix< scalar_t > &B, scalar_t beta, Matrix< scalar_t > &C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | geqrf (Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Options const &opts) |
| Distributed parallel QR factorization. | |
| template<Target target, typename scalar_t > | |
| int64_t | getrf (Matrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
| Distributed parallel LU factorization. | |
| template<Target target, typename scalar_t > | |
| int64_t | getrf_nopiv (Matrix< scalar_t > &A, Options const &opts) |
| Distributed parallel LU factorization without pivoting. | |
| template<Target target, typename scalar_t > | |
| int64_t | getrf_tntpiv (Matrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
| Distributed parallel CALU factorization. | |
| template<Target target, typename scalar_t > | |
| void | getri (Matrix< scalar_t > &A, Pivots &pivots, Options const &opts) |
| Distributed parallel inverse of a general matrix. | |
| template<typename scalar_t > | |
| void | hb2st_step (HermitianBandMatrix< scalar_t > &A, Matrix< scalar_t > &V, int64_t sweep, int64_t step) |
| template<typename scalar_t > | |
| void | hb2st_run (HermitianBandMatrix< scalar_t > &A, Matrix< scalar_t > &V, int thread_rank, int thread_size, ProgressVector &progress) |
| template<Target target, typename scalar_t > | |
| void | hb2st (HermitianBandMatrix< scalar_t > &A, Matrix< scalar_t > &V, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | hbmm (Side side, scalar_t alpha, HermitianBandMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | he2hb (HermitianMatrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Options const &opts) |
| Distributed parallel reduction to band for 3-stage Hermitian eigenvalue decomposition. | |
| template<Target target, typename scalar_t > | |
| void | hegst (int64_t itype, HermitianMatrix< scalar_t > A, HermitianMatrix< scalar_t > B, Options const &opts) |
| Distributed parallel reduction of a complex Hermitian positive-definite generalized eigenvalue problem to the standard form. | |
| template<Target target, typename scalar_t > | |
| void | hemmA (Side side, scalar_t alpha, HermitianMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | hemmC (Side side, scalar_t alpha, HermitianMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | her2k (scalar_t alpha, Matrix< scalar_t > A, Matrix< scalar_t > B, blas::real_type< scalar_t > beta, HermitianMatrix< scalar_t > C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | herk (blas::real_type< scalar_t > alpha, Matrix< scalar_t > A, blas::real_type< scalar_t > beta, HermitianMatrix< scalar_t > C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| int64_t | hetrf (HermitianMatrix< scalar_t > &A, Pivots &pivots, BandMatrix< scalar_t > &T, Pivots &pivots2, Matrix< scalar_t > &H, Options const &opts) |
| Distributed parallel Hermitian indefinite \(LTL^T\) factorization. | |
| template<Target target, typename matrix_type > | |
| blas::real_type< typename matrix_type::value_type > | norm (Norm in_norm, matrix_type A, Options const &opts) |
| template<Target target, typename scalar_t > | |
| int64_t | pbtrf (HermitianBandMatrix< scalar_t > A, Options const &opts) |
| Distributed parallel band Cholesky factorization. | |
| template<Target target, typename scalar_t > | |
| int64_t | potrf (slate::internal::TargetType< target >, HermitianMatrix< scalar_t > A, Options const &opts) |
| Distributed parallel Cholesky factorization. | |
| template<Target target, typename scalar_t > | |
| void | scale (blas::real_type< scalar_t > numer, blas::real_type< scalar_t > denom, Matrix< scalar_t > &A, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | scale (blas::real_type< scalar_t > numer, blas::real_type< scalar_t > denom, BaseTrapezoidMatrix< scalar_t > &A, Options const &opts) |
| template<Target target, typename scalar_t , typename scalar_t2 > | |
| void | scale_row_col (Equed equed, std::vector< scalar_t2 > const &R, std::vector< scalar_t2 > const &C, Matrix< scalar_t > &A, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | set (scalar_t offdiag_value, scalar_t diag_value, Matrix< scalar_t > &A, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | set (scalar_t offdiag_value, scalar_t diag_value, BaseTrapezoidMatrix< scalar_t > &A, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | symm (Side side, scalar_t alpha, SymmetricMatrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, Matrix< scalar_t > C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | syr2k (scalar_t alpha, Matrix< scalar_t > A, Matrix< scalar_t > B, scalar_t beta, SymmetricMatrix< scalar_t > C, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | syrk (scalar_t alpha, Matrix< scalar_t > A, scalar_t beta, SymmetricMatrix< scalar_t > C, Options const &opts) |
| template<typename scalar_t > | |
| void | tb2bd_step (TriangularBandMatrix< scalar_t > &A, Matrix< scalar_t > &U, Matrix< scalar_t > &V, int64_t band, int64_t sweep, int64_t step) |
| template<typename scalar_t > | |
| void | tb2bd_run (TriangularBandMatrix< scalar_t > &A, Matrix< scalar_t > &U, Matrix< scalar_t > &V, int64_t band, int64_t diag_len, int64_t pass_size, int thread_rank, int thread_size, Progress &progress) |
| template<Target target, typename scalar_t > | |
| void | tb2bd (TriangularBandMatrix< scalar_t > &A, Matrix< scalar_t > &U, Matrix< scalar_t > &V, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | tbsm (Side side, scalar_t alpha, TriangularBandMatrix< scalar_t > A, Pivots &pivots, Matrix< scalar_t > B, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | trmm (Side side, scalar_t alpha, TriangularMatrix< scalar_t > &A, Matrix< scalar_t > &B, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | trsmA (Side side, scalar_t alpha, TriangularMatrix< scalar_t > &A, Matrix< scalar_t > &B, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | trsmB (Side side, scalar_t alpha, TriangularMatrix< scalar_t > &A, Matrix< scalar_t > &B, Options const &opts) |
| template<Target target, typename scalar_t > | |
| void | trtri (TriangularMatrix< scalar_t > A, Options const &opts) |
| Distributed parallel inverse of a triangular matrix. | |
| template<Target target, typename scalar_t > | |
| void | trtrm (TriangularMatrix< scalar_t > A, Options const &opts) |
| todo: update docs: multiply not inverse. | |
| template<Target target, typename scalar_t > | |
| void | unmlq (Side side, Op op, Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Matrix< scalar_t > &C, Options const &opts) |
| Distributed parallel multiply by Q from LQ factorization. | |
| template<Target target, typename scalar_t > | |
| void | unmqr (Side side, Op op, Matrix< scalar_t > &A, TriangularFactors< scalar_t > &T, Matrix< scalar_t > &C, Options const &opts) |
| Distributed parallel multiply by Q from QR factorization. | |
| template<Target target, typename scalar_t > | |
| void | unmtr_hb2st (Side side, Op op, Matrix< scalar_t > &V, Matrix< scalar_t > &C, const std::map< Option, Value > &opts) |
| Distributed parallel multiply by the Householder reflectors V generated by the Hermitian band-to-tridiagonal reduction (hb2st). | |
Namespace used for target implementations.
This differentiates, for example, the implementations of a routine for different Target values (e.g., host vs. device execution).
| void slate::impl::add | ( | scalar_t | alpha, |
| BaseTrapezoidMatrix< scalar_t > & | A, | ||
| scalar_t | beta, | ||
| BaseTrapezoidMatrix< scalar_t > & | B, | ||
| Options const & | opts | ||
| ) |
Distributed parallel matrix-matrix addition. Generic implementation for any target.
| void slate::impl::add | ( | scalar_t | alpha, |
| Matrix< scalar_t > & | A, | ||
| scalar_t | beta, | ||
| Matrix< scalar_t > & | B, | ||
| Options const & | opts | ||
| ) |
Distributed parallel general matrix-matrix addition. Generic implementation for any target.
| void slate::impl::colNorms | ( | Norm | in_norm, |
| matrix_type | A, | ||
| blas::real_type< typename matrix_type::value_type > * | values, | ||
| Options const & | opts | ||
| ) |
Distributed parallel matrix norm. Generic implementation for any target.
| void slate::impl::copy | ( | src_matrix_type | A, |
| dst_matrix_type | B, | ||
| Options const & | opts | ||
| ) |
Copy and precision conversion. Generic implementation for any target.
| void slate::impl::gbmm | ( | scalar_t | alpha, |
| BandMatrix< scalar_t > & | A, | ||
| Matrix< scalar_t > & | B, | ||
| scalar_t | beta, | ||
| Matrix< scalar_t > & | C, | ||
| Options const & | opts | ||
| ) |
Distributed parallel general matrix-matrix multiplication. Generic implementation for any target. Dependencies enforce the following behavior:
| int64_t slate::impl::gbtrf | ( | BandMatrix< scalar_t > & | A, |
| Pivots & | pivots, | ||
| Options const & | opts | ||
| ) |
Distributed parallel band LU factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
Warning: ColMajor layout is assumed
| void slate::impl::ge2tb | ( | Matrix< scalar_t > & | A, |
| TriangularFactors< scalar_t > & | TU, | ||
| TriangularFactors< scalar_t > & | TV, | ||
| Options const & | opts | ||
| ) |
Distributed parallel reduction to band for 3-stage SVD.
Generic implementation for any target. Panel computed on host using Host OpenMP task.
ColMajor layout is assumed
| void slate::impl::gelqf | ( | Matrix< scalar_t > & | A, |
| TriangularFactors< scalar_t > & | T, | ||
| Options const & | opts | ||
| ) |
Distributed parallel LQ factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
ColMajor layout is assumed
| void slate::impl::gemmC | ( | scalar_t | alpha, |
| Matrix< scalar_t > & | A, | ||
| Matrix< scalar_t > & | B, | ||
| scalar_t | beta, | ||
| Matrix< scalar_t > & | C, | ||
| Options const & | opts | ||
| ) |
Distributed parallel general matrix-matrix multiplication. Generic implementation for any target. Dependencies enforce the following behavior:
| void slate::impl::geqrf | ( | Matrix< scalar_t > & | A, |
| TriangularFactors< scalar_t > & | T, | ||
| Options const & | opts | ||
| ) |
Distributed parallel QR factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
ColMajor layout is assumed
| void slate::impl::hb2st_run | ( | HermitianBandMatrix< scalar_t > & | A, |
| Matrix< scalar_t > & | V, | ||
| int | thread_rank, | ||
| int | thread_size, | ||
| ProgressVector & | progress | ||
| ) |
Implements multi-threaded tridiagonal bulge chasing. This is the main routine that each thread runs.
| [in,out] | A | The band Hermitian matrix A. |
| [out] | V | Matrix of Householder reflectors produced in the process. Dimension 2*band-by-XYZ todo |
| [in] | thread_rank | rank of this thread |
| [in] | thread_size | number of threads |
| [in] | progress | progress table for synchronizing threads |
printf( "tid %d pass %lld, task %lld, %lld\n",
| void slate::impl::hb2st_step | ( | HermitianBandMatrix< scalar_t > & | A, |
| Matrix< scalar_t > & | V, | ||
| int64_t | sweep, | ||
| int64_t | step | ||
| ) |
Implements the tasks of tridiagonal bulge chasing.
| [in,out] | A | The band Hermitian matrix A. |
| [out] | V | Matrix of Householder reflectors produced in the process. Dimension 2*band-by-... todo. |
| [in] | sweep | The sweep number. One sweep eliminates one row and sweeps the entire matrix. |
| [in] | step | The step number. Steps in each sweep have consecutive numbers. |
| void slate::impl::hbmm | ( | Side | side, |
| scalar_t | alpha, | ||
| HermitianBandMatrix< scalar_t > | A, | ||
| Matrix< scalar_t > | B, | ||
| scalar_t | beta, | ||
| Matrix< scalar_t > | C, | ||
| Options const & | opts | ||
| ) |
Distributed parallel Hermitian banded matrix-matrix multiplication. Generic implementation for any target. Dependencies enforce the following behavior:
ColMajor layout is assumed
| void slate::impl::hegst | ( | int64_t | itype, |
| HermitianMatrix< scalar_t > | A, | ||
| HermitianMatrix< scalar_t > | B, | ||
| Options const & | opts | ||
| ) |
Distributed parallel reduction of a complex Hermitian positive-definite generalized eigenvalue problem to the standard form.
Generic implementation for any target.
| void slate::impl::her2k | ( | scalar_t | alpha, |
| Matrix< scalar_t > | A, | ||
| Matrix< scalar_t > | B, | ||
| blas::real_type< scalar_t > | beta, | ||
| HermitianMatrix< scalar_t > | C, | ||
| Options const & | opts | ||
| ) |
Distributed parallel Hermitian rank 2k update. Generic implementation for any target. Dependencies enforce the following behavior:
| int64_t slate::impl::hetrf | ( | HermitianMatrix< scalar_t > & | A, |
| Pivots & | pivots, | ||
| BandMatrix< scalar_t > & | T, | ||
| Pivots & | pivots2, | ||
| Matrix< scalar_t > & | H, | ||
| Options const & | opts | ||
| ) |
Distributed parallel Hermitian indefinite \(LTL^T\) factorization.
Generic implementation for any target. GPU version not yet implemented.
| blas::real_type< typename matrix_type::value_type > slate::impl::norm | ( | Norm | in_norm, |
| matrix_type | A, | ||
| Options const & | opts | ||
| ) |
Distributed parallel general matrix norm. Generic implementation for any target.
| int64_t slate::impl::pbtrf | ( | HermitianBandMatrix< scalar_t > | A, |
| Options const & | opts | ||
| ) |
Distributed parallel band Cholesky factorization.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
Warning: ColMajor layout is assumed
| int64_t slate::impl::potrf | ( | slate::internal::TargetType< target > | , |
| HermitianMatrix< scalar_t > | A, | ||
| Options const & | opts | ||
| ) |
Distributed parallel Cholesky factorization.
Generic implementation for any target.
| void slate::impl::scale | ( | blas::real_type< scalar_t > | numer, |
| blas::real_type< scalar_t > | denom, | ||
| BaseTrapezoidMatrix< scalar_t > & | A, | ||
| Options const & | opts | ||
| ) |
Scale matrix entries by the real scalar numer/denom. Generic implementation for any target.
| void slate::impl::scale | ( | blas::real_type< scalar_t > | numer, |
| blas::real_type< scalar_t > | denom, | ||
| Matrix< scalar_t > & | A, | ||
| Options const & | opts | ||
| ) |
Scale matrix entries by the real scalar numer/denom. Generic implementation for any target.
| void slate::impl::scale_row_col | ( | Equed | equed, |
| std::vector< scalar_t2 > const & | R, | ||
| std::vector< scalar_t2 > const & | C, | ||
| Matrix< scalar_t > & | A, | ||
| Options const & | opts | ||
| ) |
Apply row or column scaling, or both, to a Matrix. Generic implementation for any target.
| void slate::impl::set | ( | scalar_t | offdiag_value, |
| scalar_t | diag_value, | ||
| BaseTrapezoidMatrix< scalar_t > & | A, | ||
| Options const & | opts | ||
| ) |
Set matrix entries. Generic implementation for any target.
| void slate::impl::set | ( | scalar_t | offdiag_value, |
| scalar_t | diag_value, | ||
| Matrix< scalar_t > & | A, | ||
| Options const & | opts | ||
| ) |
Set matrix entries. Generic implementation for any target.
| void slate::impl::syr2k | ( | scalar_t | alpha, |
| Matrix< scalar_t > | A, | ||
| Matrix< scalar_t > | B, | ||
| scalar_t | beta, | ||
| SymmetricMatrix< scalar_t > | C, | ||
| Options const & | opts | ||
| ) |
Distributed parallel symmetric rank 2k update. Generic implementation for any target. Dependencies enforce the following behavior:
| void slate::impl::syrk | ( | scalar_t | alpha, |
| Matrix< scalar_t > | A, | ||
| scalar_t | beta, | ||
| SymmetricMatrix< scalar_t > | C, | ||
| Options const & | opts | ||
| ) |
Distributed parallel symmetric rank k update. Generic implementation for any target. Dependencies enforce the following behavior:
| void slate::impl::tb2bd | ( | TriangularBandMatrix< scalar_t > & | A, |
| Matrix< scalar_t > & | U, | ||
| Matrix< scalar_t > & | V, | ||
| Options const & | opts | ||
| ) |
Reduces a band matrix to a bidiagonal matrix using bulge chasing.
| void slate::impl::tb2bd_run | ( | TriangularBandMatrix< scalar_t > & | A, |
| Matrix< scalar_t > & | U, | ||
| Matrix< scalar_t > & | V, | ||
| int64_t | band, | ||
| int64_t | diag_len, | ||
| int64_t | pass_size, | ||
| int | thread_rank, | ||
| int | thread_size, | ||
| Progress & | progress | ||
| ) |
Implements multi-threaded bidiagonal bulge chasing.
| [in,out] | A | The band matrix A. |
| [in] | band | The bandwidth of matrix A. |
| [in] | diag_len | The length of the diagonal. |
| [in] | pass_size | The number of rows eliminated at a time. |
| [in] | thread_rank | rank of this thread |
| [in] | thread_size | number of threads |
| [in] | progress | progress table for synchronizing threads |
| void slate::impl::tb2bd_step | ( | TriangularBandMatrix< scalar_t > & | A, |
| Matrix< scalar_t > & | U, | ||
| Matrix< scalar_t > & | V, | ||
| int64_t | band, | ||
| int64_t | sweep, | ||
| int64_t | step | ||
| ) |
Implements the tasks of bidiagonal bulge chasing.
| [in,out] | A | The band matrix A. |
| [out] | U | Matrix to store the Householder vectors applied to the left of the band matrix A. U is 2*nb-by-nt*(nt + 1)/2*nb, where nb is the tile size (A.tileNb(0)) and nt is the number of A tiles (A.nt()). The matrix U needs to be allocated on MPI rank 0, where the band matrix A resides. |
| [out] | V | Matrix to store the Householder vectors applied to the right of the band matrix A. V is 2*nb-by-nt*(nt + 1)/2*nb, where nb is the tile size (A.tileNb(0)) and nt is the number of A tiles (A.nt()). The matrix V needs to be allocated on MPI rank 0, where the band matrix A resides. |
| [in] | band | The bandwidth of matrix A. |
| [in] | sweep | The sweep number. One sweep eliminates one row and sweeps the entire matrix. |
| [in] | step | The step number. Steps in each sweep have consecutive numbers. |
| void slate::impl::tbsm | ( | Side | side, |
| scalar_t | alpha, | ||
| TriangularBandMatrix< scalar_t > | A, | ||
| Pivots & | pivots, | ||
| Matrix< scalar_t > | B, | ||
| Options const & | opts | ||
| ) |
Distributed parallel triangular matrix solve. Generic implementation for any target. Note A and B are passed by value, so we can transpose if needed (for side = right) without affecting caller.
| void slate::impl::trmm | ( | Side | side, |
| scalar_t | alpha, | ||
| TriangularMatrix< scalar_t > & | A, | ||
| Matrix< scalar_t > & | B, | ||
| Options const & | opts | ||
| ) |
Distributed parallel triangular matrix-matrix multiplication. Generic implementation for any target.
| void slate::impl::trsmB | ( | Side | side, |
| scalar_t | alpha, | ||
| TriangularMatrix< scalar_t > & | A, | ||
| Matrix< scalar_t > & | B, | ||
| Options const & | opts | ||
| ) |
Distributed parallel triangular matrix solve. Generic implementation for any target.
| void slate::impl::trtri | ( | TriangularMatrix< scalar_t > | A, |
| Options const & | opts | ||
| ) |
Distributed parallel inverse of a triangular matrix.
Generic implementation for any target. Panel and lookahead computed on host using Host OpenMP task.
| void slate::impl::trtrm | ( | TriangularMatrix< scalar_t > | A, |
| Options const & | opts | ||
| ) |
todo: update docs: multiply not inverse.
Distributed parallel triangular-triangular matrix multiply (not an inverse — see the todo note above). Generic implementation for any target.
| void slate::impl::unmlq | ( | Side | side, |
| Op | op, | ||
| Matrix< scalar_t > & | A, | ||
| TriangularFactors< scalar_t > & | T, | ||
| Matrix< scalar_t > & | C, | ||
| Options const & | opts | ||
| ) |
Distributed parallel multiply by Q from LQ factorization.
Generic implementation for any target.