SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
|
Functions | |
template<typename scalar_t > | |
void | slate::internal::gerbt (Matrix< scalar_t > A11, Matrix< scalar_t > A12, Matrix< scalar_t > A21, Matrix< scalar_t > A22, Matrix< scalar_t > U1, Matrix< scalar_t > U2, Matrix< scalar_t > V1, Matrix< scalar_t > V2) |
Applies a single butterfly matrix to each side of A. | |
template<typename scalar_t > | |
void | slate::internal::gerbt (Side side, Op trans, Matrix< scalar_t > B1, Matrix< scalar_t > B2, Matrix< scalar_t > U1, Matrix< scalar_t > U2) |
Applies a single butterfly matrix to one side of B. | |
template<typename scalar_t > | |
void | slate::internal::getrf_panel (internal::TargetType< Target::HostTask >, Matrix< scalar_t > &A, int64_t diag_len, int64_t ib, std::vector< Pivot > &pivot, blas::real_type< scalar_t > pivot_threshold, int max_panel_threads, int priority, int tag, int64_t *info) |
LU factorization of a column of tiles, host implementation. | |
template<Target target = Target::HostTask, typename scalar_t > | |
void | slate::internal::getrf_panel (Matrix< scalar_t > &&A, int64_t diag_len, int64_t ib, std::vector< Pivot > &pivot, blas::real_type< scalar_t > pivot_threshold, int max_panel_threads, int priority, int tag, int64_t *info) |
LU factorization of a column of tiles. | |
template<Target target = Target::HostTask, typename scalar_t > | |
void | slate::internal::getrf_nopiv (Matrix< scalar_t > &&A, int64_t ib, int priority, int64_t *info) |
LU factorization of a single tile without pivoting. | |
template<typename scalar_t > | |
void | slate::internal::getrf_nopiv (internal::TargetType< Target::HostTask >, Matrix< scalar_t > &A, int64_t ib, int priority, int64_t *info) |
LU factorization of a single tile without pivoting, host implementation. | |
template<typename scalar_t > | |
void | slate::internal::getrf_tntpiv_local (internal::TargetType< Target::HostTask >, std::vector< Tile< scalar_t > > &tiles, std::vector< char * > dwork_array, size_t dwork_bytes, int mlocal, int device, lapack::Queue *queue, int64_t diag_len, int64_t ib, int stage, int64_t mb, int64_t nb, std::vector< int64_t > &tile_indices, std::vector< std::vector< AuxPivot< scalar_t > > > &aux_pivot, int mpi_rank, int max_panel_threads, int priority, int64_t *info) |
Multi-threaded LU factorization of local tiles. | |
template<Target target, typename scalar_t > | |
void | slate::internal::getrf_tntpiv_panel (internal::TargetType< target >, Matrix< scalar_t > &A, Matrix< scalar_t > &Awork, std::vector< char * > dwork_array, size_t work_bytes, int64_t diag_len, int64_t ib, std::vector< Pivot > &pivot, int max_panel_threads, int priority, int64_t *info) |
LU factorization of a column of tiles. | |
template<Target target = Target::HostTask, typename scalar_t > | |
void | slate::internal::getrf_tntpiv_panel (Matrix< scalar_t > &&A, Matrix< scalar_t > &&Awork, std::vector< char * > dwork_array, size_t work_bytes, int64_t diag_len, int64_t ib, std::vector< Pivot > &pivot, int max_panel_threads, int priority, int64_t *info) |
LU factorization of a column of tiles. | |
template<typename scalar_t > | |
void | slate::internal::rbt_fill (Matrix< scalar_t > &U, const int64_t seed) |
Allocates and fills a random butterfly transform in packed storage. | |
template<typename scalar_t > | |
std::pair< Matrix< scalar_t >, Matrix< scalar_t > > | slate::internal::rbt_generate (const Matrix< scalar_t > &A, const int64_t d, const int64_t seed) |
Constructs two random butterfly matrices in packed storage to transform the given matrix. | |
void slate::internal::gerbt | ( | Matrix< scalar_t > | A11, |
Matrix< scalar_t > | A12, | ||
Matrix< scalar_t > | A21, | ||
Matrix< scalar_t > | A22, | ||
Matrix< scalar_t > | U1, | ||
Matrix< scalar_t > | U2, | ||
Matrix< scalar_t > | V1, | ||
Matrix< scalar_t > | V2 | ||
) |
Applies a single butterfly matrix to each side of A.
The matrices are divided into submatrices along the halves of the butterfly matrices.
void slate::internal::gerbt | ( | Side | side, |
Op | trans, | ||
Matrix< scalar_t > | B1, | ||
Matrix< scalar_t > | B2, | ||
Matrix< scalar_t > | U1, | ||
Matrix< scalar_t > | U2 | ||
) |
Applies a single butterfly matrix to one side of B.
The matrices are divided into submatrices along the halves of the butterfly matrix.
void slate::internal::getrf_nopiv | ( | internal::TargetType< Target::HostTask > | , |
Matrix< scalar_t > & | A, | ||
int64_t | ib, | ||
int | priority, | ||
int64_t * | info | ||
) |
LU factorization of a single tile without pivoting, host implementation.
[in,out] | info | Exit status.
|
void slate::internal::getrf_nopiv | ( | Matrix< scalar_t > && | A, |
int64_t | ib, | ||
int | priority, | ||
int64_t * | info | ||
) |
LU factorization of a single tile without pivoting.
Dispatches to target implementations.
void slate::internal::getrf_panel | ( | Matrix< scalar_t > && | A, |
int64_t | diag_len, | ||
int64_t | ib, | ||
std::vector< Pivot > & | pivot, | ||
blas::real_type< scalar_t > | pivot_threshold, | ||
int | max_panel_threads, | ||
int | priority, | ||
int | tag, | ||
int64_t * | info | ||
) |
LU factorization of a column of tiles.
Dispatches to target implementations.
void slate::internal::getrf_tntpiv_local | ( | internal::TargetType< Target::HostTask > | , |
std::vector< Tile< scalar_t > > & | tiles, | ||
std::vector< char * > | dwork_array, | ||
size_t | dwork_bytes, | ||
int | mlocal, | ||
int | device, | ||
lapack::Queue * | queue, | ||
int64_t | diag_len, | ||
int64_t | ib, | ||
int | stage, | ||
int64_t | mb, | ||
int64_t | nb, | ||
std::vector< int64_t > & | tile_indices, | ||
std::vector< std::vector< AuxPivot< scalar_t > > > & | aux_pivot, | ||
int | mpi_rank, | ||
int | max_panel_threads, | ||
int | priority, | ||
int64_t * | info | ||
) |
Multi-threaded LU factorization of local tiles.
[in] | target | Target for dispatch to correct implementation. |
[in,out] | tiles | List of tiles to factor on the CPU. |
[in,out] | dwork_array | Array of GPU device workspaces, dimension (num_devices). dwork_array[ dev ] stores dA, dwork, dipiv, and dinfo on GPU dev; dA is contiguous copy of tiles on GPU, dwork is getrf workspace, dipiv is pivot vector, dinfo is getrf return value. |
[in] | dwork_bytes | Total size of dwork_array[ dev ] in bytes for each GPU device. |
[in] | mlocal | Number of rows in dwork_array. |
[in] | device | Device performing factorization, needed for pointing to correct memory in dwork_array. Device == HostNum for CPU implementation. |
[in] | queue | Queue associated with the input device. |
[in] | diag_len | Length of diagonal, min( mb, nb ) of diagonal tile. |
[in] | ib | Inner blocking. |
[in] | stage | Stage = 0 is initial local tiles, stage = 1 is subsequent tournament. |
[in] | mb | Tile row block size. |
[in] | nb | Tile column block size. |
[in] | tile_indices | Block row indices of tiles in tiles array. |
[in] | mpi_rank | MPI rank of this process. |
[in] | max_panel_threads | Maximum number of threads to launch for local panel. |
[in] | priority | OpenMP priority. todo: unused. Should it be on taskloop? |
void slate::internal::getrf_tntpiv_panel | ( | Matrix< scalar_t > && | A, |
Matrix< scalar_t > && | Awork, | ||
std::vector< char * > | dwork_array, | ||
size_t | work_bytes, | ||
int64_t | diag_len, | ||
int64_t | ib, | ||
std::vector< Pivot > & | pivot, | ||
int | max_panel_threads, | ||
int | priority, | ||
int64_t * | info | ||
) |
LU factorization of a column of tiles.
Dispatches to target implementations.
void slate::internal::rbt_fill | ( | Matrix< scalar_t > & | U, |
const int64_t | seed | ||
) |
Allocates and fills a random butterfly transform in packed storage.
The depth is computed based on the number of columns in U.
std::pair< Matrix< scalar_t >, Matrix< scalar_t > > slate::internal::rbt_generate | ( | const Matrix< scalar_t > & | A, |
const int64_t | d, | ||
const int64_t | seed | ||
) |
Constructs two random butterfly matrices in packed storage to transform the given matrix.
[in] | A | The matrix to be transformed |
[in] | d | The depth of the transform |
[in] | seed | A seed for controlling the random number generation |