SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
|
Base class for all SLATE distributed, tiled matrices. More...
#include <BaseMatrix.hh>
Public Types | |
using | BcastList = std::vector< std::tuple< int64_t, int64_t, std::list< BaseMatrix< scalar_t > > > > |
using | BcastListTag = std::vector< std::tuple< int64_t, int64_t, std::list< BaseMatrix< scalar_t > >, int64_t > > |
using | ReduceList = std::vector< std::tuple< int64_t, int64_t, BaseMatrix< scalar_t >, std::list< BaseMatrix< scalar_t > > > > |
using | ij_tuple = typename MatrixStorage< scalar_t >::ij_tuple |
typedef scalar_t | value_type |
Public Member Functions | |
template<typename out_scalar_t > | |
BaseMatrix< out_scalar_t > | baseEmptyLike (int64_t mb, int64_t nb, Op deepOp) |
[internal] | |
Tile< scalar_t > | operator() (int64_t i, int64_t j, int device=HostNum) |
Get shallow copy of tile {i, j} of op(A) on given device, with the tile's op flag set to match the matrix's. | |
Tile< scalar_t > | at (int64_t i, int64_t j, int device=HostNum) |
Alias of operator(). | |
int | num_devices () const |
Returns number of devices (per MPI process) to distribute matrix to. | |
void | gridinfo (GridOrder *order, int *nprow, int *npcol, int *myrow, int *mycol) |
Get nprow, npcol, myrow, mycol for 2D block cyclic (2DBC) distribution. | |
std::function< int64_t(int64_t i)> | tileMbFunc () const |
Returns tileMb function. | |
std::function< int64_t(int64_t j)> | tileNbFunc () const |
Returns tileNb function. | |
std::function< int(ij_tuple ij)> | tileRankFunc () const |
Returns tileRank function. | |
std::function< int(ij_tuple ij)> | tileDeviceFunc () const |
Returns tileDevice function. | |
int64_t | m () const |
Returns number of rows in op(A). | |
int64_t | n () const |
Returns number of columns in op(A). | |
int64_t | mt () const |
Returns number of block rows in op(A). | |
int64_t | nt () const |
Returns number of block cols in op(A). | |
Op | op () const |
Returns transposition operation op(A) as NoTrans, Trans, or ConjTrans. | |
bool | tileExists (int64_t i, int64_t j, int device=HostNum) |
returns true if tile exists on specified device | |
int | tileRank (int64_t i, int64_t j) const |
Returns MPI rank of tile {i, j} of op(A). | |
int | tileDevice (int64_t i, int64_t j) const |
Returns device of tile {i, j} of op(A). | |
bool | tileIsLocal (int64_t i, int64_t j) const |
Returns whether tile {i, j} of op(A) is local. | |
Uplo | uplo () const |
Returns whether op(A) is logically Lower, Upper, or General. | |
Uplo | uploLogical () const |
Returns whether op(A) is logically Lower, Upper, or General storage, taking the transposition operation into account. | |
Uplo | uploPhysical () const |
Returns whether A is physically Lower, Upper, or General storage, ignoring the transposition operation. | |
Tile< scalar_t > | originTile (int64_t i, int64_t j) |
Returns the origin tile instance of tile(i, j) | |
Target | origin () const |
Tile origin. | |
int64_t | tileMb (int64_t i) const |
Returns number of rows (mb) in block row i of op(A). | |
int64_t | tileNb (int64_t j) const |
Returns number of cols (nb) in block col j of op(A). | |
Tile< scalar_t > | tileInsert (int64_t i, int64_t j, int device=HostNum) |
Insert tile {i, j} of op(A) and allocate its data. | |
Tile< scalar_t > | tileInsert (int64_t i, int64_t j, int device, scalar_t *A, int64_t ld) |
Insert tile {i, j} of op(A) with existing data. | |
Tile< scalar_t > | tileInsert (int64_t i, int64_t j, scalar_t *A, int64_t ld) |
Insert tile with default device=HostNum. | |
Tile< scalar_t > | tileInsertWorkspace (int64_t i, int64_t j, int device, Layout layout) |
Insert a workspace tile {i, j} of op(A) and allocate its data. | |
Tile< scalar_t > | tileInsertWorkspace (int64_t i, int64_t j, int device) |
Tile< scalar_t > | tileInsertWorkspace (int64_t i, int64_t j, Layout layout) |
Tile< scalar_t > | tileInsertWorkspace (int64_t i, int64_t j) |
scalar_t * | allocWorkspaceBuffer (int device, int64_t size) |
Allocates a workspace buffer using the matrix's memory pool. | |
void | freeWorkspaceBuffer (int device, scalar_t *buffer) |
Frees a workspace buffer allocated with BaseMatrix::allocWorkspaceBuffer. | |
MOSI | tileState (int64_t i, int64_t j, int device=HostNum) |
Returns tile(i, j)'s state on device (defaults to host). | |
bool | tileOnHold (int64_t i, int64_t j, int device=HostNum) |
Returns whether tile(i, j) is OnHold on device (defaults to host). | |
void | tileUnsetHold (int64_t i, int64_t j, int device=HostNum) |
Unsets the hold of tile(i, j) on device (defaults to host) if it was OnHold. | |
void | tileUnsetHoldAll (int device=HostNum) |
Unsets all local tiles' hold on device. | |
void | tileUnsetHoldAllOnDevices () |
Unsets all local tiles' hold on all devices. | |
void | tileModified (int64_t i, int64_t j, int device=HostNum, bool permissive=false) |
Marks tile(i, j) as Modified on device. | |
void | tileAcquire (int64_t i, int64_t j, int device, Layout layout) |
Acquire tile(i, j) on device without copying data if not already exists. | |
void | tileAcquire (int64_t i, int64_t j, Layout layout) |
void | tileGetForReading (int64_t i, int64_t j, int device, LayoutConvert layout) |
Gets tile(i, j) for reading on device. | |
void | tileGetForReading (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout) |
Gets a set of tiles for reading on device. | |
void | tileGetForReading (int64_t i, int64_t j, LayoutConvert layout) |
Gets tile(i, j) for reading on host. | |
void | tileGetForReading (std::set< ij_tuple > &tile_set, LayoutConvert layout) |
Gets a set of tiles for reading on host. | |
void | tileGetForReading (std::set< ij_tuple > &tile_set, LayoutConvert layout, int from_device) |
void | tileGetAllForReading (int device, LayoutConvert layout) |
Gets all local tiles for reading on device. | |
void | tileGetAllForReadingOnDevices (LayoutConvert layout) |
Gets all local tiles for reading on corresponding devices. | |
void | tileGetForWriting (int64_t i, int64_t j, int device, LayoutConvert layout) |
Gets tile(i, j) for writing on device. | |
void | tileGetForWriting (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout) |
Gets a set of tiles for writing on device. | |
void | tileGetForWriting (int64_t i, int64_t j, LayoutConvert layout) |
Gets tile(i, j) for writing on host. | |
void | tileGetForWriting (std::set< ij_tuple > &tile_set, LayoutConvert layout) |
Gets a set of tiles for writing on host. | |
void | tileGetAllForWriting (int device, LayoutConvert layout) |
Gets all local tiles for writing on device. | |
void | tileGetAllForWritingOnDevices (LayoutConvert layout) |
Gets all local tiles for writing on corresponding devices. | |
void | tileGetAndHold (int64_t i, int64_t j, int device, LayoutConvert layout) |
Gets tile(i, j) on device for reading and marks it as MOSI::OnHold. | |
void | tileGetAndHold (int64_t i, int64_t j, LayoutConvert layout) |
Gets tile(i, j) on host for reading and marks it as MOSI::OnHold. | |
void | tileGetAndHold (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout) |
Gets a set of tiles for reading on device and marks them as MOSI::OnHold. | |
void | tileGetAndHold (std::set< ij_tuple > &tile_set, LayoutConvert layout) |
Gets a set of tiles for reading on host and marks them as MOSI::OnHold. | |
void | tileGetAndHoldAll (int device, LayoutConvert layout) |
Gets all local tiles on device and marks them as MOSI::OnHold. | |
void | tileGetAndHoldAllOnDevices (LayoutConvert layout) |
Gets all local tiles on corresponding devices and marks them as MOSI::OnHold. | |
Tile< scalar_t > | tileUpdateOrigin (int64_t i, int64_t j) |
Updates the origin instance of tile(i, j) if MOSI::Invalid tile must be local. | |
void | tileUpdateAllOrigin () |
Updates all origin instances of local tiles if MOSI::Invalid. | |
int64_t | tileLife (int64_t i, int64_t j) const |
Returns life counter of tile {i, j} of op(A). | |
void | tileLife (int64_t i, int64_t j, int64_t life) |
Set life counter of tile {i, j} of op(A). | |
void | tileTick (int64_t i, int64_t j) |
Decrements life counter of workspace tile {i, j} of op(A). | |
int64_t | tileReceiveCount (int64_t i, int64_t j) const |
Returns how many times the tile {i, j} is received through MPI. | |
void | tileIncrementReceiveCount (int64_t i, int64_t j) |
Increments the number of times the tile {i, j} is received through MPI. | |
void | tileDecrementReceiveCount (int64_t i, int64_t j, int64_t release_count=1) |
Decrements the number of times the tile {i, j} is received through MPI. | |
void | tileErase (int64_t i, int64_t j, int device=HostNum) |
Erase tile {i, j} of op(A) on device (host, one device or all devices). | |
void | tileRelease (int64_t i, int64_t j, int device=HostNum) |
Erase the tile {i, j}'s instance on device if it is a workspace tile with no hold set on it. | |
void | tileSend (int64_t i, int64_t j, int dst_rank, int tag=0) |
Send tile {i, j} of op(A) to the given MPI rank. | |
template<Target target> | |
void | tileSend (int64_t i, int64_t j, int dst_rank, int tag=0) |
void | tileIsend (int64_t i, int64_t j, int dst_rank, int tag, MPI_Request *request) |
Immediately send tile {i, j} of op(A) to the given MPI rank. | |
template<Target target = Target::Host> | |
void | tileRecv (int64_t i, int64_t j, int dst_rank, Layout layout, int tag=0) |
Receive tile {i, j} of op(A) to the given MPI rank. | |
template<Target target = Target::Host> | |
void | tileIrecv (int64_t i, int64_t j, int dst_rank, Layout layout, int tag, MPI_Request *request) |
Receive tile {i, j} of op(A) to the given MPI rank using immediate mode. | |
template<Target target = Target::Host> | |
void | tileBcast (int64_t i, int64_t j, BaseMatrix const &B, Layout layout, int tag=0) |
Send tile {i, j} of op(A) to all MPI ranks in matrix B. | |
template<Target target = Target::Host> | |
void | tileBcast (int64_t i, int64_t j, BaseMatrix const &B, Layout layout, int tag, int64_t life_factor) |
template<Target target = Target::Host> | |
void | listBcast (BcastList &bcast_list, Layout layout, int tag=0, bool is_shared=false) |
Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list. | |
template<Target target = Target::Host> | |
void | listBcast (BcastList &bcast_list, Layout layout, int tag, int64_t life_factor, bool is_shared=false) |
template<Target target = Target::Host> | |
void | listBcastMT (BcastListTag &bcast_list, Layout layout, bool is_shared=false) |
Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list (using OpenMP taskloop and multi-threaded MPI). | |
template<Target target = Target::Host> | |
void | listBcastMT (BcastListTag &bcast_list, Layout layout, int64_t life_factor, bool is_shared=false) |
template<Target target = Target::Host> | |
void | listReduce (ReduceList &reduce_list, Layout layout, int tag=0) |
Layout | layout () const |
Returns matrix layout flag. | |
Layout | tileLayout (int64_t i, int64_t j, int device=HostNum) |
Returns Layout of tile(i, j, device) | |
void | tileLayout (int64_t i, int64_t j, int device, Layout layout) |
Sets Layout of tile(i, j, device) | |
void | tileLayout (int64_t i, int64_t j, Layout layout) |
Sets Layout of tile(i, j, host) | |
bool | tileLayoutIsConvertible (int64_t i, int64_t j, int device=HostNum) |
Returns whether tile(i, j, device) can be safely transposed. | |
void | tileLayoutConvert (int64_t i, int64_t j, int device, Layout layout, bool reset=false, bool async=false) |
Converts tile(i, j, device) into 'layout'. | |
void | tileLayoutConvert (int64_t i, int64_t j, Layout layout, bool reset=false, bool async=false) |
Convert layout of tile(i, j) to layout on host, optionally reset. | |
void | tileLayoutConvert (std::set< ij_tuple > &tile_set, int device, Layout layout, bool reset=false) |
Converts tiles indicated in 'tile_set' that exist on 'device' into 'layout' if not already in 'layout' major. | |
void | tileLayoutConvert (std::set< ij_tuple > &tile_set, Layout layout, bool reset=false) |
Convert layout of a set of tiles to layout on host, optionally reset. | |
void | tileLayoutConvert (int device, Layout layout, bool reset=false) |
Converts all existing tile instances on 'device' into 'layout' Operates in batch mode. | |
void | tileLayoutConvertOnDevices (Layout layout, bool reset=false) |
Converts all existing tile instances on available devices into 'layout'. | |
void | tileLayoutReset (int64_t i, int64_t j, int device, Layout layout) |
Converts tile(i, j) into current layout and resets its extended buffer. | |
void | tileLayoutReset (int64_t i, int64_t j, Layout layout) |
void | tileLayoutReset (std::set< ij_tuple > &tile_set, int device, Layout layout) |
Converts set of tiles into current layout and resets their extended buffers. | |
void | tileLayoutReset (std::set< ij_tuple > &tile_set, Layout layout) |
void | tileLayoutReset () |
Converts all origin tiles into current matrix-layout. | |
void | tileReduceFromSet (int64_t i, int64_t j, int root_rank, std::set< int > &reduce_set, int radix, int tag, Layout layout) |
[internal] WARNING: Sent and received tiles are converted to 'layout' major. | |
void | getRanks (std::set< int > *bcast_set) const |
Puts all MPI ranks that have tiles in the matrix into the set. | |
void | getLocalDevices (std::set< int > *dev_set) const |
Puts all devices that have local tiles in the matrix into the set. | |
int64_t | numLocalTiles () const |
Returns number of local tiles in this matrix. | |
MPI_Comm | mpiComm () const |
int | mpiRank () const |
MPI_Group | mpiGroup () const |
void | clear () |
Removes all tiles from matrix. | |
void | releaseWorkspace () |
Clears all workspace tiles that are not OnHold. | |
void | releaseLocalWorkspaceTile (int64_t i, int64_t j) |
Erases a given local workspace tile, if not on hold or modified. | |
void | releaseLocalWorkspace () |
Erases all local workspace tiles, if not on hold or modified. | |
void | releaseLocalWorkspace (std::set< ij_tuple > &tile_set) |
Erases a given set of local workspace tiles from all devices including host, if not on hold or modified. | |
void | releaseRemoteWorkspaceTile (int64_t i, int64_t j, int64_t release_count=1) |
Erases a given tile that is not local to node from all devices including host, if not on hold or modified. | |
void | releaseRemoteWorkspace (int64_t receive_count=1) |
Erases tiles that are not local to node from all devices including host, if not on hold or modified. | |
void | releaseRemoteWorkspace (std::set< ij_tuple > &tile_set, int64_t release_count=1) |
Erases a given set of tiles that are not local to node from all devices including host, if not on hold or modified. | |
void | clearWorkspace () |
Removes all temporary host and device workspace tiles from matrix. | |
void | allocateBatchArrays (int64_t batch_size, int64_t num_arrays) |
Allocates batch arrays and BLAS++ queues for all devices. | |
void | clearBatchArrays () |
Removes batch arrays from matrix for all devices. | |
int64_t | batchArraySize () |
scalar_t ** | array_host (int device, int64_t batch_arrays_index=0) |
scalar_t ** | array_device (int device, int64_t batch_arrays_index=0) |
lapack::Queue * | comm_queue (int device) |
lapack::Queue * | compute_queue (int device, int queue_index=0) |
int | numComputeQueues () |
Static Public Attributes | |
static constexpr bool | is_complex = slate::is_complex<scalar_t>::value |
static constexpr bool | is_real = ! is_complex |
Protected Member Functions | |
BaseMatrix () | |
[internal] Default constructor creates an empty matrix. | |
BaseMatrix (int64_t m, int64_t n, std::function< int64_t(int64_t i)> &inTileMb, std::function< int64_t(int64_t j)> &inTileNb, std::function< int(ij_tuple ij)> &inTileRank, std::function< int(ij_tuple ij)> &inTileDevice, MPI_Comm mpi_comm) | |
[internal] Construct matrix with mt block rows and nt block columns, such that sum_{i = 0}^{mt-1} tileMb(i) >= m, sum_{j = 0}^{nt-1} tileNb(j) >= n, where tileMb, tileNb, tileRank, tileDevice are given as functions. | |
BaseMatrix (int64_t m, int64_t n, int64_t mb, int64_t nb, GridOrder order, int nprow, int npcol, MPI_Comm mpi_comm) | |
[internal] Construct matrix with mt = ceil( m / mb ) block rows and nt = ceil( n / nb ) block columns, with fixed mb-by-nb tile size and 2D block cyclic distribution. | |
BaseMatrix (int64_t m, int64_t n, int64_t mb, int64_t nb, int nprow, int npcol, MPI_Comm mpi_comm) | |
With order = Col. | |
BaseMatrix (int64_t m, int64_t n, int64_t nb, int nprow, int npcol, MPI_Comm mpi_comm) | |
With mb = nb, order = Col. | |
BaseMatrix (BaseMatrix &orig, int64_t i1, int64_t i2, int64_t j1, int64_t j2) | |
[internal] Sub-matrix constructor creates shallow copy view of parent matrix, B[ i1:i2, j1:j2 ]. | |
BaseMatrix (BaseMatrix &orig, Slice slice) | |
[internal] Sliced matrix constructor creates shallow copy view of parent matrix, B[ row1:row2, col1:col2 ]. | |
void | tileBcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set) |
void | tileBcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set, int radix, int tag, Layout layout, Target target) |
[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set. | |
void | tileIbcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set, int radix, int tag, Layout layout, std::vector< MPI_Request > &send_requests, Target target) |
[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set. | |
std::tuple< int64_t, int64_t > | globalIndex (int64_t i, int64_t j) const |
[internal] Returns index {i, j} in global matrix as a tuple, taking into account the local offset and transpose. | |
std::tuple< int64_t, int64_t, int > | globalIndex (int64_t i, int64_t j, int device) const |
[internal] Returns index {i, j, dev} in global matrix as a tuple, taking into account the local offset and transpose. | |
int64_t | row0_offset () const |
row offset of first block row. | |
int64_t | col0_offset () const |
col offset of first block col. | |
int64_t | last_mb () const |
rows in last block row. | |
int64_t | last_nb () const |
cols in last block col. | |
int64_t | ioffset () const |
block row offset with respect to original matrix | |
int64_t | joffset () const |
block col offset with respect to original matrix | |
Protected Attributes | |
Uplo | uplo_ |
upper or lower storage | |
Op | op_ |
transpose operation with respect to original matrix | |
Layout | layout_ |
intended layout of the matrix. defaults to ColMajor. | |
Target | origin_ |
Tile origins. defaults to Host. | |
std::shared_ptr< MatrixStorage< scalar_t > > | storage_ |
shared storage of tiles and buffers | |
MPI_Comm | mpi_comm_ |
MPI_Group | mpi_group_ |
int | mpi_rank_ |
Friends | |
class | Debug |
template<typename MatrixType > | |
MatrixType | transpose (MatrixType &A) |
Returns shallow copy of op(A) that is transposed. | |
template<typename MatrixType > | |
MatrixType | conj_transpose (MatrixType &A) |
Returns shallow copy of op(A) that is conjugate-transpose. | |
template<typename T > | |
void | swap (BaseMatrix< T > &A, BaseMatrix< T > &B) |
Base class for all SLATE distributed, tiled matrices.
In general, the documentation refers to the current matrix object as op(A), to emphasize that it may be transposed with respect to its parent matrix.
|
protected |
[internal] Default constructor creates an empty matrix.
Does not allocate any memory.
|
protected |
[internal] Construct matrix with mt block rows and nt block columns, such that sum_{i = 0}^{mt-1} tileMb(i) >= m, sum_{j = 0}^{nt-1} tileNb(j) >= n, where tileMb, tileNb, tileRank, tileDevice are given as functions.
No tiles are allocated. Creates empty matrix storage.
[in] | m | Number of rows of the matrix. m >= 0. |
[in] | n | Number of columns of the matrix. n >= 0 |
[in] | inTileMb | Function that takes block-row index, returns block-row size. |
[in] | inTileNb | Function that takes block-col index, returns block-col size. |
[in] | inTileRank | Function that takes tuple of { block-row, block-col } indices, returns MPI rank for that tile. |
[in] | inTileDevice | Function that takes tuple of { block-row, block-col } indices, returns local GPU device ID for that tile. |
[in] | mpi_comm | MPI communicator to distribute matrix across. |
|
protected |
[internal] Construct matrix with mt = ceil( m / mb ) block rows and nt = ceil( n / nb ) block columns, with fixed mb-by-nb tile size and 2D block cyclic distribution.
No tiles are allocated. Creates empty matrix storage.
[in] | m | Number of rows of the matrix. m >= 0. |
[in] | n | Number of columns of the matrix. n >= 0 |
[in] | mb | Row block size in 2D block-cyclic distribution. mb > 0. |
[in] | nb | Column block size in 2D block-cyclic distribution. nb > 0. |
[in] | order | Order to map MPI processes to tile grid, GridOrder::ColMajor (default) or GridOrder::RowMajor. |
[in] | nprow | Number of process rows in 2D block-cyclic distribution. nprow > 0. |
[in] | npcol | Number of process cols of 2D block-cyclic distribution. npcol > 0. |
[in] | mpi_comm | MPI communicator to distribute matrix across. nprow * npcol <= MPI_Comm_size( mpi_comm ). |
|
protected |
[internal] Sub-matrix constructor creates shallow copy view of parent matrix, B[ i1:i2, j1:j2 ].
B[ 0:mt-1, 0:nt-1 ] is the entire B matrix. If i2 < i1 or j2 < j1, produces empty matrix. See Matrix::sub().
[in,out] | B | Parent matrix B. |
[in] | i1 | Starting block row index. 0 <= i1 < mt. |
[in] | i2 | Ending block row index (inclusive). i2 < mt. |
[in] | j1 | Starting block column index. 0 <= j1 < nt. |
[in] | j2 | Ending block column index (inclusive). j2 < nt. |
|
protected |
[internal] Sliced matrix constructor creates shallow copy view of parent matrix, B[ row1:row2, col1:col2 ].
This takes row & col indices instead of block row & block col indices. B[ 0:m-1, 0:n-1 ] is the entire B matrix. If row2 < row1 or col2 < col1, produces empty matrix. See Matrix::slice().
[in,out] | B | Parent matrix B. |
[in] | slice | Start and end row and column indices: |
|
inline |
Allocates batch arrays and BLAS++ queues for all devices.
Matrix classes override this with versions that can also allocate based on the number of local tiles.
[in] | batch_size | Allocate batch arrays as needed so that size of each batch array >= batch_size >= 0. |
[in] | num_arrays | Allocate batch arrays as needed so that number of batch arrays per device >= num_arrays >= 1. |
scalar_t * slate::BaseMatrix< scalar_t >::allocWorkspaceBuffer | ( | int | device, |
int64_t | size | ||
) |
Allocates a workspace buffer using the matrix's memory pool.
The memory must be freed with BaseMatrix::freeWorkspaceBuffer
[in] | device | Device ID (GPU or Host) where the memory block is needed. |
[in] | size | The required allocation size |
|
inline |
|
inline |
|
inline |
|
inline |
Removes all tiles from matrix.
WARNING: currently this clears the entire parent matrix, not just a sub-matrix.
|
inline |
Removes batch arrays from matrix for all devices.
WARNING: currently this clears the entire parent matrix, not just a sub-matrix.
|
inline |
Removes all temporary host and device workspace tiles from matrix.
WARNING: currently, this clears the entire parent matrix, not just a sub-matrix.
|
inline |
[in] | device | Tile's device ID. |
|
inline |
[in] | device | Tile's device ID |
[in] | queue_index | The index of a specific set of queues |
void slate::BaseMatrix< scalar_t >::freeWorkspaceBuffer | ( | int | device, |
scalar_t * | buffer | ||
) |
Frees a workspace buffer allocated with BaseMatrix::allocWorkspaceBuffer.
[in] | device | Device ID (GPU or Host) where the memory block is needed. |
[in] | buffer | Pointer to the buffer |
void slate::BaseMatrix< scalar_t >::getLocalDevices | ( | std::set< int > * | dev_set | ) | const |
Puts all devices that have local tiles in the matrix into the set.
[out] | dev_set | On output, set of device IDs that this sub-matrix has local tiles on. |
void slate::BaseMatrix< scalar_t >::getRanks | ( | std::set< int > * | bcast_set | ) | const |
Puts all MPI ranks that have tiles in the matrix into the set.
[out] | bcast_set | On output, set of MPI ranks that this sub-matrix has tiles on. |
|
protected |
|
protected |
void slate::BaseMatrix< scalar_t >::gridinfo | ( | GridOrder * | order, |
int * | nprow, | ||
int * | npcol, | ||
int * | myrow, | ||
int * | mycol | ||
) |
Get nprow, npcol, myrow, mycol for 2D block cyclic (2DBC) distribution.
If SLATE doesn't know the distribution, sets all values to -1. todo: Assumes col-major 2D block cyclic distribution, not row-major.
[out] | order | Order to map MPI processes to tile grid. |
[out] | nprow | Number of process rows. |
[out] | npcol | Number of process cols. |
[out] | myrow | Process row for this process (MPI rank). |
[out] | mycol | Process col for this process (MPI rank). |
void slate::BaseMatrix< scalar_t >::listBcast | ( | BcastList & | bcast_list, |
Layout | layout, | ||
int | tag = 0 , |
||
bool | is_shared = false |
||
) |
Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list.
Data received must be in 'layout' (ColMajor/RowMajor) major.
target | Destination to target; either Host (default) or Device. |
[in] | bcast_list | List of submatrices defining the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}. |
[in] | layout | Indicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank. |
[in] | tag | MPI tag, default 0. |
[in] | is_shared | A flag to get and hold the broadcasted (prefetched) tiles on the devices. This flag prevents any subsequent calls of tileRelease() routine to release these tiles (clear up the devices memories). WARNING: must set unhold these tiles before releasing them to free up the allocated memories. |
void slate::BaseMatrix< scalar_t >::listBcastMT | ( | BcastListTag & | bcast_list, |
Layout | layout, | ||
bool | is_shared = false |
||
) |
Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list (using OpenMP taskloop and multi-threaded MPI).
Data received must be in 'layout' (ColMajor/RowMajor) major.
target | Destination to target; either Host (default) or Device. |
[in] | bcast_list | List of submatrices defining the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}. Each tile {i, j} to be broadcast has a tag in the bcast_list. |
[in] | layout | Indicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank. |
[in] | is_shared | A flag to get and hold the broadcasted (prefetched) tiles on the devices. This flag prevents any subsequent calls of tileRelease() routine to release these tiles (clear up the devices memories). WARNING: must set unhold these tiles before releasing them to free up the allocated memories. |
|
inline |
Tile< scalar_t > slate::BaseMatrix< scalar_t >::operator() | ( | int64_t | i, |
int64_t | j, | ||
int | device = HostNum |
||
) |
Tile< scalar_t > slate::BaseMatrix< scalar_t >::originTile | ( | int64_t | i, |
int64_t | j | ||
) |
void slate::BaseMatrix< scalar_t >::releaseLocalWorkspace | ( | std::set< ij_tuple > & | tile_set | ) |
Erases a given set of local workspace tiles from all devices including host, if not on hold or modified.
[in] | tile_set | Set of (i, j) tuples indicating indices of tiles to be erased. |
void slate::BaseMatrix< scalar_t >::releaseRemoteWorkspace | ( | std::set< ij_tuple > & | tile_set, |
int64_t | release_count = 1 |
||
) |
Erases a given set of tiles that are not local to node from all devices including host, if not on hold or modified.
[in] | tile_set | Set of (i, j) tuples indicating indices of tiles to be erased. |
void slate::BaseMatrix< scalar_t >::releaseRemoteWorkspaceTile | ( | int64_t | i, |
int64_t | j, | ||
int64_t | release_count = 1 |
||
) |
Erases a given tile that is not local to node from all devices including host, if not on hold or modified.
The tile's receive count is decremented. If the receive count reaches zero, the tile is erased. Otherwise, tile is not erased.
|
inline |
Clears all workspace tiles that are not OnHold.
For local tiles, it ensures that a valid copy remains.
Note that local tiles are currently not released if it would leave all remaining tiles invalid, but this behavior may change in the future and should not be relied on.
void slate::BaseMatrix< scalar_t >::tileAcquire | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
Layout | layout | ||
) |
Acquire tile(i, j) on device without copying data if not already exists.
This is used when the destination tile's data will be overridden. Converts destination Layout to 'layout' param. Assumes the TileNode(i, j) already exists.
void slate::BaseMatrix< scalar_t >::tileBcast | ( | int64_t | i, |
int64_t | j, | ||
BaseMatrix< scalar_t > const & | B, | ||
Layout | layout, | ||
int | tag = 0 |
||
) |
Send tile {i, j} of op(A) to all MPI ranks in matrix B.
If target is Devices, also copies tile to all devices on each MPI rank. This should be called by at least all ranks with local tiles in B; ones that do not have any local tiles are excluded from the broadcast. Data received must be in 'layout' (ColMajor/RowMajor) major.
target | Destination to target; either Host (default) or Device. |
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | B | Sub-matrix B defines the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}. |
[in] | layout | Indicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank. |
[in] | tag | MPI tag, default 0. |
|
protected |
[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.
This should be called by all (and only) ranks that are in bcast_set, as either the root sender or a receiver. This function implements a custom pattern using sends and receives. Data received must be in 'layout' (ColMajor/RowMajor) major.
|
inline |
Returns tileDevice function.
Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.
void slate::BaseMatrix< scalar_t >::tileErase | ( | int64_t | i, |
int64_t | j, | ||
int | device = HostNum |
||
) |
Erase tile {i, j} of op(A) on device (host, one device or all devices).
If tile's memory was allocated by SLATE, via tileInsert(i, j, dev) or tileInsertWorkspace(i, j, dev), then the memory is released to the allocator pool.
void slate::BaseMatrix< scalar_t >::tileGetAllForReading | ( | int | device, |
LayoutConvert | layout | ||
) |
Gets all local tiles for reading on device.
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
void slate::BaseMatrix< scalar_t >::tileGetAllForReadingOnDevices | ( | LayoutConvert | layout | ) |
Gets all local tiles for reading on corresponding devices.
[in] | layout | Indicates whether to convert the Layout of the received data:
|
void slate::BaseMatrix< scalar_t >::tileGetAllForWriting | ( | int | device, |
LayoutConvert | layout | ||
) |
Gets all local tiles for writing on device.
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
void slate::BaseMatrix< scalar_t >::tileGetAllForWritingOnDevices | ( | LayoutConvert | layout | ) |
Gets all local tiles for writing on corresponding devices.
[in] | layout | Indicates whether to convert the Layout of the received data:
|
void slate::BaseMatrix< scalar_t >::tileGetAndHold | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
LayoutConvert | layout | ||
) |
Gets tile(i, j) on device for reading and marks it as MOSI::OnHold.
Will copy tile in if it does not exist or its state is MOSI::Invalid. Updates the source tile's state to MOSI::Shared if copied-in. Converts destination Layout based on 'layout' param.
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
|
inline |
Gets tile(i, j) on host for reading and marks it as MOSI::OnHold.
void slate::BaseMatrix< scalar_t >::tileGetAndHold | ( | std::set< ij_tuple > & | tile_set, |
int | device, | ||
LayoutConvert | layout | ||
) |
Gets a set of tiles for reading on device and marks them as MOSI::OnHold.
[in] | tile_set | Set of (i, j) tuples indicating indices of Tiles' to be acquired. |
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
|
inline |
Gets a set of tiles for reading on host and marks them as MOSI::OnHold.
void slate::BaseMatrix< scalar_t >::tileGetAndHoldAll | ( | int | device, |
LayoutConvert | layout | ||
) |
Gets all local tiles on device and marks them as MOSI::OnHold.
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
void slate::BaseMatrix< scalar_t >::tileGetAndHoldAllOnDevices | ( | LayoutConvert | layout | ) |
Gets all local tiles on corresponding devices and marks them as MOSI::OnHold.
[in] | layout | Indicates whether to convert the Layout of the received data:
|
void slate::BaseMatrix< scalar_t >::tileGetForReading | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
LayoutConvert | layout | ||
) |
Gets tile(i, j) for reading on device.
Will copy-in the tile if it does not exist or its state is MOSI::Invalid. Sets tile state to MOSI::Shared if copied-in. Finds a source tile whose state is valid (Modified|Shared) by looping on existing tile instances. Updates source tile's state to shared if copied-in. Converts destination Layout based on 'layout' param.
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
|
inline |
Gets tile(i, j) for reading on host.
void slate::BaseMatrix< scalar_t >::tileGetForReading | ( | std::set< ij_tuple > & | tile_set, |
int | device, | ||
LayoutConvert | layout | ||
) |
Gets a set of tiles for reading on device.
[in] | tile_set | Set of (i, j) tuples indicating indices of Tiles' to be acquired. |
[in] | device | Tile's destination: host or device ID. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
|
inline |
Gets a set of tiles for reading on host.
void slate::BaseMatrix< scalar_t >::tileGetForWriting | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
LayoutConvert | layout | ||
) |
Gets tile(i, j) for writing on device.
Sets destination tile's state to MOSI::Modified. Will copy-in the tile if it does not exist or its state is MOSI::Invalid. Other instances will be invalidated. Finds a source tile whose state is valid (Modified|Shared) by scanning existing tile instances. Converts destination Layout based on 'layout' param.
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
|
inline |
Gets tile(i, j) for writing on host.
void slate::BaseMatrix< scalar_t >::tileGetForWriting | ( | std::set< ij_tuple > & | tile_set, |
int | device, | ||
LayoutConvert | layout | ||
) |
Gets a set of tiles for writing on device.
[in] | tile_set | Set of (i, j) tuples indicating indices of Tiles' to be acquired. |
[in] | device | Tile's destination: host or device ID, defaults to host. |
[in] | layout | Indicates whether to convert the Layout of the received data:
|
|
inline |
Gets a set of tiles for writing on host.
|
protected |
[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.
This should be called by all (and only) ranks that are in bcast_set, as either the root sender or a receiver. This function implements a custom pattern using sends and receives. Data received must be in 'layout' (ColMajor/RowMajor) major. Nonblocking sends are used, with requests appended to the provided vector.
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | bcast_set | Set of MPI ranks to broadcast to. |
[in] | radix | Radix of the communication pattern. |
[in] | tag | MPI tag, default 0. |
[in] | layout | Indicates the Layout (ColMajor/RowMajor) of the received data. |
[in,out] | send_requests | Vector where requests for this bcast are appended. |
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
scalar_t * | data, | ||
int64_t | ld | ||
) |
Insert tile {i, j} of op(A) with existing data.
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | device | Tile's device ID; default is HostNum (provided by overloaded function). |
[in,out] | data | Tile's data. The matrix uses this pointer directly, it does not copy the data, so the data must remain valid while the matrix exists. |
[in] | ld | Leading dimension of data; column stride. ld >= tileMb(i). |
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert | ( | int64_t | i, |
int64_t | j, | ||
int | device = HostNum |
||
) |
|
inline |
Insert tile with default device=HostNum.
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsertWorkspace | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
Layout | layout | ||
) |
void slate::BaseMatrix< scalar_t >::tileIrecv | ( | int64_t | i, |
int64_t | j, | ||
int | src_rank, | ||
Layout | layout, | ||
int | tag, | ||
MPI_Request * | request | ||
) |
Receive tile {i, j} of op(A) to the given MPI rank using immediate mode.
Tile is allocated as workspace if it doesn't yet exist. Source rank must call tileSend() or tileIsend(). Data received must be in 'layout' (ColMajor/RowMajor) major.
For non-GPU-aware MPI, Irecv currently can't respect the target.
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | src_rank | Source MPI rank. If src_rank == mpiRank, this is a no-op. |
[in] | layout | Indicates the Layout (ColMajor/RowMajor) of the received data. WARNING: must match the layout of the tile in the sender MPI rank. |
[in] | tag | MPI tag |
[out] | request | MPI request object |
void slate::BaseMatrix< scalar_t >::tileIsend | ( | int64_t | i, |
int64_t | j, | ||
int | dst_rank, | ||
int | tag, | ||
MPI_Request * | request | ||
) |
Immediately send tile {i, j} of op(A) to the given MPI rank.
Destination rank must call tileRecv() or tileIrecv().
target | Destination to target; either Host (default) or Device. |
void slate::BaseMatrix< scalar_t >::tileLayoutConvert | ( | int | device, |
Layout | layout, | ||
bool | reset = false |
||
) |
Converts all existing tile instances on 'device' into 'layout' Operates in batch mode.
[in] | device | Tiles' host or device ID. |
[in] | layout | Intended Layout of tiles:
|
[in] | reset | Optionally resets the tiles extended buffers. |
void slate::BaseMatrix< scalar_t >::tileLayoutConvert | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
Layout | layout, | ||
bool | reset = false , |
||
bool | async = false |
||
) |
Converts tile(i, j, device) into 'layout'.
Tile should exist on 'device', will assert otherwise. Tile will be made Convertible if it was not.
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | device | Tile's host or device ID. |
[in] | layout | Intended Layout of tile:
|
[in] | reset | Optionally resets the tile extended buffers. |
todo: handle op(A), sub-matrix, and sliced-matrix
void slate::BaseMatrix< scalar_t >::tileLayoutConvert | ( | std::set< ij_tuple > & | tile_set, |
int | device, | ||
Layout | layout, | ||
bool | reset = false |
||
) |
Converts tiles indicated in 'tile_set' that exist on 'device' into 'layout' if not already in 'layout' major.
Tiles should exist on 'device', will throw exception otherwise. Operates in batch mode when tiles are on devices. If device is not Host, will bucket tiles into uniform size and stride batches, then launches each batch transpose.
[in] | tile_set | Set of (i, j) tuples indicating indices of Tiles' to be converted. |
[in] | device | Tiles' host or device ID. |
[in] | layout | Intended Layout of tiles:
|
[in] | reset | Optionally resets the tiles extended buffers. |
void slate::BaseMatrix< scalar_t >::tileLayoutConvertOnDevices | ( | Layout | layout, |
bool | reset = false |
||
) |
Converts all existing tile instances on available devices into 'layout'.
Host tiles are not affected. Tiles should exist already on devices. Operates in batch mode.
[in] | layout | Intended Layout of tiles:
|
[in] | reset | Optionally resets the tile extended buffers. |
bool slate::BaseMatrix< scalar_t >::tileLayoutIsConvertible | ( | int64_t | i, |
int64_t | j, | ||
int | device = HostNum |
||
) |
Returns whether tile(i, j, device) can be safely transposed.
based on its 'TileKind', buffer size, Layout, and stride. Tile instance on 'device' should exist.
void slate::BaseMatrix< scalar_t >::tileLayoutReset |
Converts all origin tiles into current matrix-layout.
Operates in batch mode.
void slate::BaseMatrix< scalar_t >::tileLayoutReset | ( | int64_t | i, |
int64_t | j, | ||
int | device, | ||
Layout | layout | ||
) |
void slate::BaseMatrix< scalar_t >::tileLayoutReset | ( | std::set< ij_tuple > & | tile_set, |
int | device, | ||
Layout | layout | ||
) |
Converts set of tiles into current layout and resets their extended buffers.
Operates in batch mode.
[in] | tile_set | Set of (i, j) indices of tiles to be converted and reset. |
[in] | device | Tile's device ID; default is HostNum. |
[in] | layout | Intended Layout of tiles:
|
int64_t slate::BaseMatrix< scalar_t >::tileMb | ( | int64_t | i | ) | const |
Returns number of rows (mb) in block row i of op(A).
[in] | i | Tile's block row index. 0 <= i < mt. |
|
inline |
Returns tileMb function.
Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.
void slate::BaseMatrix< scalar_t >::tileModified | ( | int64_t | i, |
int64_t | j, | ||
int | device = HostNum , |
||
bool | permissive = false |
||
) |
Marks tile(i, j) as Modified on device.
Other instances will be invalidated. Unless permissive, asserts if other instances are in Modified state.
int64_t slate::BaseMatrix< scalar_t >::tileNb | ( | int64_t | j | ) | const |
Returns number of cols (nb) in block col j of op(A).
[in] | j | Tile's block column index. 0 <= j < nt. |
|
inline |
Returns tileNb function.
Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.
|
inline |
|
inline |
Returns tileRank function.
Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.
|
inline |
Returns how many times the tile {i, j} is received through MPI.
This function is used to track tiles that may be communicated twice due to symmetry during hemm and symm operations.
void slate::BaseMatrix< scalar_t >::tileRecv | ( | int64_t | i, |
int64_t | j, | ||
int | src_rank, | ||
Layout | layout, | ||
int | tag = 0 |
||
) |
Receive tile {i, j} of op(A) to the given MPI rank.
Tile is allocated as workspace if it doesn't yet exist. Source rank must call tileSend() or tileIsend(). Data received must be in 'layout' (ColMajor/RowMajor) major.
Note that when target=Devices and non-GPU-aware MPI is used, an OpenMP task may be created to copy the tile to device.
target | Destination to target; either Host (default) or Device. |
[in] | i | Tile's block row index. 0 <= i < mt. |
[in] | j | Tile's block column index. 0 <= j < nt. |
[in] | src_rank | Source MPI rank. If src_rank == mpiRank, this is a no-op. |
[in] | layout | Indicates the Layout (ColMajor/RowMajor) of the received data. WARNING: must match the layout of the tile in the sender MPI rank. |
[in] | tag | MPI tag, default 0. |
void slate::BaseMatrix< scalar_t >::tileRelease | ( | int64_t | i, |
int64_t | j, | ||
int | device = HostNum |
||
) |
Erase the tile {i, j}'s instance on device if it is a workspace tile with no hold set on it.
If tile's memory was allocated by SLATE, via tileInsert(i, j, dev) or tileInsertWorkspace(i, j, dev), then the memory is released to the allocator pool. For local tiles, it ensures that a valid copy remains.
Note that local tiles are currently not released if it would leave all remaining tiles invalid, but this behavior may change in the future and should not be relied on.
void slate::BaseMatrix< scalar_t >::tileSend | ( | int64_t | i, |
int64_t | j, | ||
int | dst_rank, | ||
int | tag = 0 |
||
) |
Send tile {i, j} of op(A) to the given MPI rank.
Destination rank must call tileRecv() or tileIrecv().
target | Destination to target; either Host (default) or Device. |
|
inline |
|
inline |
Decrements life counter of workspace tile {i, j} of op(A).
Then, if life reaches 0, deletes tile on all devices. For local, non-workspace tiles, does nothing.
|
inline |
void slate::BaseMatrix< scalar_t >::tileUnsetHoldAll | ( | int | device = HostNum | ) |
Unsets all local tiles' hold on device.
[in] | device | Tile's device ID. |
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileUpdateOrigin | ( | int64_t | i, |
int64_t | j | ||
) |
Updates the origin instance of tile(i, j) if MOSI::Invalid tile must be local.
|
friend |
Returns shallow copy of op(A) that is conjugate-transpose.
|
friend |
Returns shallow copy of op(A) that is transposed.
Making this a template avoids repeating the code ad nauseam in each class. Tile and BaseMatrix make this a friend, to change op.