SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
Functions

template<Target target = Target::HostTask, typename scalar_t>
void slate::internal::geqrf(Matrix<scalar_t>&& A, Matrix<scalar_t>&& T, std::vector<scalar_t*> dwork_array, size_t work_size, int64_t ib, int max_panel_threads, int priority)
    QR factorization of a column of tiles.

template<typename scalar_t>
void slate::internal::geqrf(internal::TargetType<Target::HostTask>, Matrix<scalar_t>& A, Matrix<scalar_t>& T, std::vector<scalar_t*> dwork_array, size_t work_size, int64_t ib, int max_panel_threads, int priority)
    QR factorization of a column of tiles, HostTask implementation.

template<typename scalar_t>
void slate::internal::geqrf(internal::TargetType<Target::HostNest>, Matrix<scalar_t>& A, Matrix<scalar_t>& T, std::vector<scalar_t*> dwork_array, size_t work_size, int64_t ib, int max_panel_threads, int priority)
    QR factorization of a column of tiles, HostNest implementation.

template<typename scalar_t>
void slate::internal::geqrf(internal::TargetType<Target::HostBatch>, Matrix<scalar_t>& A, Matrix<scalar_t>& T, std::vector<scalar_t*> dwork_array, size_t work_size, int64_t ib, int max_panel_threads, int priority)
    QR factorization of a column of tiles, HostBatch implementation.

template<typename scalar_t>
void slate::internal::geqrf(internal::TargetType<Target::Devices>, Matrix<scalar_t>& A, Matrix<scalar_t>& T, std::vector<scalar_t*> dwork_array, size_t work_size, int64_t ib, int max_panel_threads, int priority)
    QR factorization of a column of tiles, device implementation.

template<Target target = Target::HostTask, typename scalar_t>
void slate::internal::ttmqr(Side side, Op op, Matrix<scalar_t>&& A, Matrix<scalar_t>&& T, Matrix<scalar_t>&& C, int tag)
    Distributed multiply matrix by Q from QR triangle-triangle factorization of column of tiles.

template<typename scalar_t>
void slate::internal::ttmqr(internal::TargetType<Target::HostTask>, Side side, Op op, Matrix<scalar_t>& A, Matrix<scalar_t>& T, Matrix<scalar_t>& C, int tag)
    Distributed multiply matrix by Q from QR triangle-triangle factorization of column of tiles, host implementation.

template<Target target = Target::HostTask, typename scalar_t>
void slate::internal::ttqrt(Matrix<scalar_t>&& A, Matrix<scalar_t>&& T)
    Distributed QR triangle-triangle factorization of column of tiles.

template<typename scalar_t>
void slate::internal::ttqrt(internal::TargetType<Target::HostTask>, Matrix<scalar_t>& A, Matrix<scalar_t>& T)
    Distributed QR triangle-triangle factorization, host implementation.

template<Target target = Target::HostTask, typename scalar_t>
void slate::internal::unmqr(Side side, Op op, Matrix<scalar_t>&& V, Matrix<scalar_t>&& T, Matrix<scalar_t>&& C, Matrix<scalar_t>&& W, int priority, int64_t queue_index)
    Multiply matrix by Q from local QR factorization.

template<Target target, typename scalar_t>
void slate::internal::unmqr(internal::TargetType<target>, Side side, Op op, Matrix<scalar_t> V, Matrix<scalar_t>& T, Matrix<scalar_t>& C, Matrix<scalar_t>& W, int priority, int64_t queue_index)
    Multiply matrix by Q from local QR factorization.
template<typename scalar_t>
void slate::internal::geqrf(
    internal::TargetType<Target::HostBatch>,
    Matrix<scalar_t>& A,
    Matrix<scalar_t>& T,
    std::vector<scalar_t*> dwork_array,
    size_t work_size,
    int64_t ib,
    int max_panel_threads,
    int priority )
QR factorization of a column of tiles, HostBatch implementation.
Forwards to the HostTask implementation, as there is currently no HostBatch-specific implementation.
template<typename scalar_t>
void slate::internal::geqrf(
    internal::TargetType<Target::HostNest>,
    Matrix<scalar_t>& A,
    Matrix<scalar_t>& T,
    std::vector<scalar_t*> dwork_array,
    size_t work_size,
    int64_t ib,
    int max_panel_threads,
    int priority )
QR factorization of a column of tiles, HostNest implementation.
Forwards to the HostTask implementation, as there is currently no HostNest-specific implementation.
template<Target target = Target::HostTask, typename scalar_t>
void slate::internal::geqrf(
    Matrix<scalar_t>&& A,
    Matrix<scalar_t>&& T,
    std::vector<scalar_t*> dwork_array,
    size_t work_size,
    int64_t ib,
    int max_panel_threads,
    int priority )
QR factorization of a column of tiles.
Dispatches to target implementations.
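As a minimal sketch of the dispatch idiom (not the library's actual source; the body is an assumption based on the documented signatures), the Target-templated front end re-invokes geqrf with an internal::TargetType<target> tag, and overload resolution on that tag selects the HostTask, HostNest, HostBatch, or Devices implementation at compile time:

    // Hypothetical sketch of the tag-dispatch pattern.
    // Arguments are forwarded unchanged; the TargetType<target> tag
    // picks which implementation overload is called.
    template <Target target, typename scalar_t>
    void geqrf(Matrix<scalar_t>&& A, Matrix<scalar_t>&& T,
               std::vector<scalar_t*> dwork_array, size_t work_size,
               int64_t ib, int max_panel_threads, int priority)
    {
        geqrf(internal::TargetType<target>(),
              A, T, dwork_array, work_size,
              ib, max_panel_threads, priority);
    }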
template<Target target = Target::HostTask, typename scalar_t>
void slate::internal::ttmqr(
    Side side,
    Op op,
    Matrix<scalar_t>&& A,
    Matrix<scalar_t>&& T,
    Matrix<scalar_t>&& C,
    int tag )
Distributed multiply matrix by Q from QR triangle-triangle factorization of column of tiles.
Dispatches to target implementations.
todo: This assumes A and T have already been communicated as needed; however, it necessarily handles communication for C. The tag is used in geqrf to differentiate communication for the look-ahead panel from the rest of the trailing matrix.
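Schematically (a sketch, with the pairing of ranks i and j shown only for illustration): once ttqrt (below) has combined the triangular tiles R_i and R_j of two ranks using an orthogonal factor Q_{ij}, one ttmqr step applies that factor to the corresponding pair of block rows of C, e.g. for side = left:

    \begin{bmatrix} C_i \\ C_j \end{bmatrix}
    \;\leftarrow\;
    \mathrm{op}(Q_{ij}) \begin{bmatrix} C_i \\ C_j \end{bmatrix}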
template<typename scalar_t>
void slate::internal::ttqrt(
    internal::TargetType<Target::HostTask>,
    Matrix<scalar_t>& A,
    Matrix<scalar_t>& T )
Distributed QR triangle-triangle factorization, host implementation.
Assumes panel tiles reside on host.
template<Target target = Target::HostTask, typename scalar_t>
void slate::internal::ttqrt(
    Matrix<scalar_t>&& A,
    Matrix<scalar_t>&& T )
Distributed QR triangle-triangle factorization of column of tiles.
Each rank has one triangular tile, the result of local geqrf panel. Dispatches to target implementations.
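One reduction step can be written as a small QR factorization of two stacked triangles; a sketch, where R_i and R_j denote the triangular factors held by a pair of ranks (the pairing follows the reduction tree and is shown here only schematically):

    \begin{bmatrix} R_i \\ R_j \end{bmatrix}
    \;=\;
    Q_{ij} \begin{bmatrix} \hat{R}_i \\ 0 \end{bmatrix}

After the step, one rank of the pair holds the combined triangle \hat{R}_i, and the reflectors defining Q_{ij} are kept for later application by ttmqr.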
template<Target target, typename scalar_t>
void slate::internal::unmqr(
    internal::TargetType<target>,
    Side side,
    Op op,
    Matrix<scalar_t> V,
    Matrix<scalar_t>& T,
    Matrix<scalar_t>& C,
    Matrix<scalar_t>& W,
    int priority,
    int64_t queue_index )
Multiply matrix by Q from local QR factorization.
C = op(Q) C for side = left, or C = C op(Q) for side = right. Assumes V and T are each a single block-column. Assumes W and C have the same dimensions and distribution. This corresponds to larfb( ..., direct=Forward, storev=Columnwise, ... ). This does not include applying the distributed triangle-triangle reductions.
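Spelled out in the compact-WY form used by larfb with direct = Forward and storev = Columnwise, the block reflector is

    H = I - V\,T\,V^H,

so the two updates are

    C \;\leftarrow\; \mathrm{op}(H)\,C \;=\; C - V\,\mathrm{op}(T)\,V^H C
        \quad (\text{side} = \text{left}),
    C \;\leftarrow\; C\,\mathrm{op}(H) \;=\; C - C\,V\,\mathrm{op}(T)\,V^H
        \quad (\text{side} = \text{right}),

where op(T) = T for no transpose and T^H for conjugate transpose.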