SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
Loading...
Searching...
No Matches
slate::BaseMatrix< scalar_t > Class Template Reference

Base class for all SLATE distributed, tiled matrices. More...

#include <BaseMatrix.hh>

Inheritance diagram for slate::BaseMatrix< scalar_t >:
slate::BaseBandMatrix< scalar_t > slate::BaseTrapezoidMatrix< scalar_t > slate::Matrix< scalar_t > slate::BandMatrix< scalar_t > slate::BaseTriangularBandMatrix< scalar_t > slate::HermitianMatrix< scalar_t > slate::SymmetricMatrix< scalar_t > slate::TrapezoidMatrix< scalar_t > slate::HermitianBandMatrix< scalar_t > slate::TriangularBandMatrix< scalar_t > slate::TriangularMatrix< scalar_t >

Public Types

using BcastList = std::vector< std::tuple< int64_t, int64_t, std::list< BaseMatrix< scalar_t > > > >
 
using BcastListTag = std::vector< std::tuple< int64_t, int64_t, std::list< BaseMatrix< scalar_t > >, int64_t > >
 
using ReduceList = std::vector< std::tuple< int64_t, int64_t, BaseMatrix< scalar_t >, std::list< BaseMatrix< scalar_t > > > >
 
using ij_tuple = typename MatrixStorage< scalar_t >::ij_tuple
 
typedef scalar_t value_type
 

Public Member Functions

template<typename out_scalar_t >
BaseMatrix< out_scalar_t > baseEmptyLike (int64_t mb, int64_t nb, Op deepOp)
 [internal]
 
Tile< scalar_t > operator() (int64_t i, int64_t j, int device=HostNum)
 Get shallow copy of tile {i, j} of op(A) on given device, with the tile's op flag set to match the matrix's.
 
Tile< scalar_t > at (int64_t i, int64_t j, int device=HostNum)
 Alias of operator().
 
int num_devices () const
 Returns number of devices (per MPI process) to distribute matrix to.
 
void gridinfo (GridOrder *order, int *nprow, int *npcol, int *myrow, int *mycol)
 Get nprow, npcol, myrow, mycol for 2D block cyclic (2DBC) distribution.
 
std::function< int64_t(int64_t i)> tileMbFunc () const
 Returns tileMb function.
 
std::function< int64_t(int64_t j)> tileNbFunc () const
 Returns tileNb function.
 
std::function< int(ij_tuple ij)> tileRankFunc () const
 Returns tileRank function.
 
std::function< int(ij_tuple ij)> tileDeviceFunc () const
 Returns tileDevice function.
 
int64_t m () const
 Returns number of rows in op(A).
 
int64_t n () const
 Returns number of columns in op(A).
 
int64_t mt () const
 Returns number of block rows in op(A).
 
int64_t nt () const
 Returns number of block cols in op(A).
 
Op op () const
 Returns transposition operation op(A) as NoTrans, Trans, or ConjTrans.
 
bool tileExists (int64_t i, int64_t j, int device=HostNum)
 returns true if tile exists on specified device
 
int tileRank (int64_t i, int64_t j) const
 Returns MPI rank of tile {i, j} of op(A).
 
int tileDevice (int64_t i, int64_t j) const
 Returns device of tile {i, j} of op(A).
 
bool tileIsLocal (int64_t i, int64_t j) const
 Returns whether tile {i, j} of op(A) is local.
 
Uplo uplo () const
 Returns whether op(A) is logically Lower, Upper, or General.
 
Uplo uploLogical () const
 Returns whether op(A) is logically Lower, Upper, or General storage, taking the transposition operation into account.
 
Uplo uploPhysical () const
 Returns whether A is physically Lower, Upper, or General storage, ignoring the transposition operation.
 
Tile< scalar_t > originTile (int64_t i, int64_t j)
 Returns the origin tile instance of tile(i, j)
 
Target origin () const
 Tile origin.
 
int64_t tileMb (int64_t i) const
 Returns number of rows (mb) in block row i of op(A).
 
int64_t tileNb (int64_t j) const
 Returns number of cols (nb) in block col j of op(A).
 
Tile< scalar_t > tileInsert (int64_t i, int64_t j, int device=HostNum)
 Insert tile {i, j} of op(A) and allocate its data.
 
Tile< scalar_t > tileInsert (int64_t i, int64_t j, int device, scalar_t *A, int64_t ld)
 Insert tile {i, j} of op(A) with existing data.
 
Tile< scalar_t > tileInsert (int64_t i, int64_t j, scalar_t *A, int64_t ld)
 Insert tile with default device=HostNum.
 
Tile< scalar_t > tileInsertWorkspace (int64_t i, int64_t j, int device, Layout layout)
 Insert a workspace tile {i, j} of op(A) and allocate its data.
 
Tile< scalar_t > tileInsertWorkspace (int64_t i, int64_t j, int device)
 
Tile< scalar_t > tileInsertWorkspace (int64_t i, int64_t j, Layout layout)
 
Tile< scalar_t > tileInsertWorkspace (int64_t i, int64_t j)
 
scalar_t * allocWorkspaceBuffer (int device, int64_t size)
 Allocates a workspace buffer using the matrix's memory pool.
 
void freeWorkspaceBuffer (int device, scalar_t *buffer)
 Frees a workspace buffer allocated with BaseMatrix::allocWorkspaceBuffer.
 
MOSI tileState (int64_t i, int64_t j, int device=HostNum)
 Returns tile(i, j)'s state on device (defaults to host).
 
bool tileOnHold (int64_t i, int64_t j, int device=HostNum)
 Returns whether tile(i, j) is OnHold on device (defaults to host).
 
void tileUnsetHold (int64_t i, int64_t j, int device=HostNum)
 Unsets the hold of tile(i, j) on device (defaults to host) if it was OnHold.
 
void tileUnsetHoldAll (int device=HostNum)
 Unsets all local tiles' hold on device.
 
void tileUnsetHoldAllOnDevices ()
 Unsets all local tiles' hold on all devices.
 
void tileModified (int64_t i, int64_t j, int device=HostNum, bool permissive=false)
 Marks tile(i, j) as Modified on device.
 
void tileAcquire (int64_t i, int64_t j, int device, Layout layout)
 Acquire tile(i, j) on device without copying data if not already exists.
 
void tileAcquire (int64_t i, int64_t j, Layout layout)
 
void tileGetForReading (int64_t i, int64_t j, int device, LayoutConvert layout)
 Gets tile(i, j) for reading on device.
 
void tileGetForReading (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout)
 Gets a set of tiles for reading on device.
 
void tileGetForReading (int64_t i, int64_t j, LayoutConvert layout)
 Gets tile(i, j) for reading on host.
 
void tileGetForReading (std::set< ij_tuple > &tile_set, LayoutConvert layout)
 Gets a set of tiles for reading on host.
 
void tileGetForReading (std::set< ij_tuple > &tile_set, LayoutConvert layout, int from_device)
 
void tileGetAllForReading (int device, LayoutConvert layout)
 Gets all local tiles for reading on device.
 
void tileGetAllForReadingOnDevices (LayoutConvert layout)
 Gets all local tiles for reading on corresponding devices.
 
void tileGetForWriting (int64_t i, int64_t j, int device, LayoutConvert layout)
 Gets tile(i, j) for writing on device.
 
void tileGetForWriting (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout)
 Gets a set of tiles for writing on device.
 
void tileGetForWriting (int64_t i, int64_t j, LayoutConvert layout)
 Gets tile(i, j) for writing on host.
 
void tileGetForWriting (std::set< ij_tuple > &tile_set, LayoutConvert layout)
 Gets a set of tiles for writing on host.
 
void tileGetAllForWriting (int device, LayoutConvert layout)
 Gets all local tiles for writing on device.
 
void tileGetAllForWritingOnDevices (LayoutConvert layout)
 Gets all local tiles for writing on corresponding devices.
 
void tileGetAndHold (int64_t i, int64_t j, int device, LayoutConvert layout)
 Gets tile(i, j) on device for reading and marks it as MOSI::OnHold.
 
void tileGetAndHold (int64_t i, int64_t j, LayoutConvert layout)
 Gets tile(i, j) on host for reading and marks it as MOSI::OnHold.
 
void tileGetAndHold (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout)
 Gets a set of tiles for reading on device and marks them as MOSI::OnHold.
 
void tileGetAndHold (std::set< ij_tuple > &tile_set, LayoutConvert layout)
 Gets a set of tiles for reading on host and marks them as MOSI::OnHold.
 
void tileGetAndHoldAll (int device, LayoutConvert layout)
 Gets all local tiles on device and marks them as MOSI::OnHold.
 
void tileGetAndHoldAllOnDevices (LayoutConvert layout)
 Gets all local tiles on corresponding devices and marks them as MOSI::OnHold.
 
Tile< scalar_t > tileUpdateOrigin (int64_t i, int64_t j)
 Updates the origin instance of tile(i, j) if MOSI::Invalid tile must be local.
 
void tileUpdateAllOrigin ()
 Updates all origin instances of local tiles if MOSI::Invalid.
 
int64_t tileLife (int64_t i, int64_t j) const
 Returns life counter of tile {i, j} of op(A).
 
void tileLife (int64_t i, int64_t j, int64_t life)
 Set life counter of tile {i, j} of op(A).
 
void tileTick (int64_t i, int64_t j)
 Decrements life counter of workspace tile {i, j} of op(A).
 
int64_t tileReceiveCount (int64_t i, int64_t j) const
 Returns how many times the tile {i, j} is received through MPI.
 
void tileIncrementReceiveCount (int64_t i, int64_t j)
 Increments the number of times the tile {i, j} is received through MPI.
 
void tileDecrementReceiveCount (int64_t i, int64_t j, int64_t release_count=1)
 Decrements the number of times the tile {i, j} is received through MPI.
 
void tileErase (int64_t i, int64_t j, int device=HostNum)
 Erase tile {i, j} of op(A) on device (host, one device or all devices).
 
void tileRelease (int64_t i, int64_t j, int device=HostNum)
 Erase the tile {i, j}'s instance on device if it is a workspace tile with no hold set on it.
 
void tileSend (int64_t i, int64_t j, int dst_rank, int tag=0)
 Send tile {i, j} of op(A) to the given MPI rank.
 
template<Target target>
void tileSend (int64_t i, int64_t j, int dst_rank, int tag=0)
 
void tileIsend (int64_t i, int64_t j, int dst_rank, int tag, MPI_Request *request)
 Immediately send tile {i, j} of op(A) to the given MPI rank.
 
template<Target target = Target::Host>
void tileRecv (int64_t i, int64_t j, int dst_rank, Layout layout, int tag=0)
 Receive tile {i, j} of op(A) to the given MPI rank.
 
template<Target target = Target::Host>
void tileIrecv (int64_t i, int64_t j, int dst_rank, Layout layout, int tag, MPI_Request *request)
 Receive tile {i, j} of op(A) to the given MPI rank using immediate mode.
 
template<Target target = Target::Host>
void tileBcast (int64_t i, int64_t j, BaseMatrix const &B, Layout layout, int tag=0)
 Send tile {i, j} of op(A) to all MPI ranks in matrix B.
 
template<Target target = Target::Host>
void tileBcast (int64_t i, int64_t j, BaseMatrix const &B, Layout layout, int tag, int64_t life_factor)
 
template<Target target = Target::Host>
void listBcast (BcastList &bcast_list, Layout layout, int tag=0, bool is_shared=false)
 Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list.
 
template<Target target = Target::Host>
void listBcast (BcastList &bcast_list, Layout layout, int tag, int64_t life_factor, bool is_shared=false)
 
template<Target target = Target::Host>
void listBcastMT (BcastListTag &bcast_list, Layout layout, bool is_shared=false)
 Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list (using OpenMP taskloop and multi-threaded MPI).
 
template<Target target = Target::Host>
void listBcastMT (BcastListTag &bcast_list, Layout layout, int64_t life_factor, bool is_shared=false)
 
template<Target target = Target::Host>
void listReduce (ReduceList &reduce_list, Layout layout, int tag=0)
 
Layout layout () const
 Returns matrix layout flag.
 
Layout tileLayout (int64_t i, int64_t j, int device=HostNum)
 Returns Layout of tile(i, j, device)
 
void tileLayout (int64_t i, int64_t j, int device, Layout layout)
 Sets Layout of tile(i, j, device)
 
void tileLayout (int64_t i, int64_t j, Layout layout)
 Sets Layout of tile(i, j, host)
 
bool tileLayoutIsConvertible (int64_t i, int64_t j, int device=HostNum)
 Returns whether tile(i, j, device) can be safely transposed.
 
void tileLayoutConvert (int64_t i, int64_t j, int device, Layout layout, bool reset=false, bool async=false)
 Converts tile(i, j, device) into 'layout'.
 
void tileLayoutConvert (int64_t i, int64_t j, Layout layout, bool reset=false, bool async=false)
 Convert layout of tile(i, j) to layout on host, optionally reset.
 
void tileLayoutConvert (std::set< ij_tuple > &tile_set, int device, Layout layout, bool reset=false)
 Converts tiles indicated in 'tile_set' that exist on 'device' into 'layout' if not already in 'layout' major.
 
void tileLayoutConvert (std::set< ij_tuple > &tile_set, Layout layout, bool reset=false)
 Convert layout of a set of tiles to layout on host, optionally reset.
 
void tileLayoutConvert (int device, Layout layout, bool reset=false)
 Converts all existing tile instances on 'device' into 'layout' Operates in batch mode.
 
void tileLayoutConvertOnDevices (Layout layout, bool reset=false)
 Converts all existing tile instances on available devices into 'layout'.
 
void tileLayoutReset (int64_t i, int64_t j, int device, Layout layout)
 Converts tile(i, j) into current layout and resets its extended buffer.
 
void tileLayoutReset (int64_t i, int64_t j, Layout layout)
 
void tileLayoutReset (std::set< ij_tuple > &tile_set, int device, Layout layout)
 Converts set of tiles into current layout and resets their extended buffers.
 
void tileLayoutReset (std::set< ij_tuple > &tile_set, Layout layout)
 
void tileLayoutReset ()
 Converts all origin tiles into current matrix-layout.
 
void tileReduceFromSet (int64_t i, int64_t j, int root_rank, std::set< int > &reduce_set, int radix, int tag, Layout layout)
 [internal] WARNING: Sent and received tiles are converted to 'layout' major.
 
void getRanks (std::set< int > *bcast_set) const
 Puts all MPI ranks that have tiles in the matrix into the set.
 
void getLocalDevices (std::set< int > *dev_set) const
 Puts all devices that have local tiles in the matrix into the set.
 
int64_t numLocalTiles () const
 Returns number of local tiles in this matrix.
 
MPI_Comm mpiComm () const
 
int mpiRank () const
 
MPI_Group mpiGroup () const
 
void clear ()
 Removes all tiles from matrix.
 
void releaseWorkspace ()
 Clears all workspace tiles that are not OnHold.
 
void releaseLocalWorkspaceTile (int64_t i, int64_t j)
 Erases a given local workspace tile, if not on hold or modified.
 
void releaseLocalWorkspace ()
 Erases all local workspace tiles, if not on hold or modified.
 
void releaseLocalWorkspace (std::set< ij_tuple > &tile_set)
 Erases a given set of local workspace tiles from all devices including host, if not on hold or modified.
 
void releaseRemoteWorkspaceTile (int64_t i, int64_t j, int64_t release_count=1)
 Erases a given tile that is not local to node from all devices including host, if not on hold or modified.
 
void releaseRemoteWorkspace (int64_t receive_count=1)
 Erases tiles that are not local to node from all devices including host, if not on hold or modified.
 
void releaseRemoteWorkspace (std::set< ij_tuple > &tile_set, int64_t release_count=1)
 Erases a given set of tiles that are not local to node from all devices including host, if not on hold or modified.
 
void clearWorkspace ()
 Removes all temporary host and device workspace tiles from matrix.
 
void allocateBatchArrays (int64_t batch_size, int64_t num_arrays)
 Allocates batch arrays and BLAS++ queues for all devices.
 
void clearBatchArrays ()
 Removes batch arrays from matrix for all devices.
 
int64_t batchArraySize ()
 
scalar_t ** array_host (int device, int64_t batch_arrays_index=0)
 
scalar_t ** array_device (int device, int64_t batch_arrays_index=0)
 
lapack::Queue * comm_queue (int device)
 
lapack::Queue * compute_queue (int device, int queue_index=0)
 
int numComputeQueues ()
 

Static Public Attributes

static constexpr bool is_complex = slate::is_complex<scalar_t>::value
 
static constexpr bool is_real = ! is_complex
 

Protected Member Functions

 BaseMatrix ()
 [internal] Default constructor creates an empty matrix.
 
 BaseMatrix (int64_t m, int64_t n, std::function< int64_t(int64_t i)> &inTileMb, std::function< int64_t(int64_t j)> &inTileNb, std::function< int(ij_tuple ij)> &inTileRank, std::function< int(ij_tuple ij)> &inTileDevice, MPI_Comm mpi_comm)
 [internal] Construct matrix with mt block rows and nt block columns, such that sum_{i = 0}^{mt-1} tileMb(i) >= m, sum_{j = 0}^{nt-1} tileNb(j) >= n, where tileMb, tileNb, tileRank, tileDevice are given as functions.
 
 BaseMatrix (int64_t m, int64_t n, int64_t mb, int64_t nb, GridOrder order, int nprow, int npcol, MPI_Comm mpi_comm)
 [internal] Construct matrix with mt = ceil( m / mb ) block rows and nt = ceil( n / nb ) block columns, with fixed mb-by-nb tile size and 2D block cyclic distribution.
 
 BaseMatrix (int64_t m, int64_t n, int64_t mb, int64_t nb, int nprow, int npcol, MPI_Comm mpi_comm)
 With order = Col.
 
 BaseMatrix (int64_t m, int64_t n, int64_t nb, int nprow, int npcol, MPI_Comm mpi_comm)
 With mb = nb, order = Col.
 
 BaseMatrix (BaseMatrix &orig, int64_t i1, int64_t i2, int64_t j1, int64_t j2)
 [internal] Sub-matrix constructor creates shallow copy view of parent matrix, B[ i1:i2, j1:j2 ].
 
 BaseMatrix (BaseMatrix &orig, Slice slice)
 [internal] Sliced matrix constructor creates shallow copy view of parent matrix, B[ row1:row2, col1:col2 ].
 
void tileBcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set)
 
void tileBcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set, int radix, int tag, Layout layout, Target target)
 [internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.
 
void tileIbcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set, int radix, int tag, Layout layout, std::vector< MPI_Request > &send_requests, Target target)
 [internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.
 
std::tuple< int64_t, int64_t > globalIndex (int64_t i, int64_t j) const
 [internal] Returns index {i, j} in global matrix as a tuple, taking into account the local offset and transpose.
 
std::tuple< int64_t, int64_t, int > globalIndex (int64_t i, int64_t j, int device) const
 [internal] Returns index {i, j, dev} in global matrix as a tuple, taking into account the local offset and transpose.
 
int64_t row0_offset () const
 row offset of first block row.
 
int64_t col0_offset () const
 col offset of first block col.
 
int64_t last_mb () const
 rows in last block row.
 
int64_t last_nb () const
 cols in last block col.
 
int64_t ioffset () const
 block row offset with respect to original matrix
 
int64_t joffset () const
 block col offset with respect to original matrix
 

Protected Attributes

Uplo uplo_
 upper or lower storage
 
Op op_
 transpose operation with respect to original matrix
 
Layout layout_
 intended layout of the matrix. defaults to ColMajor.
 
Target origin_
 Tile origins. defaults to Host.
 
std::shared_ptr< MatrixStorage< scalar_t > > storage_
 shared storage of tiles and buffers
 
MPI_Comm mpi_comm_
 
MPI_Group mpi_group_
 
int mpi_rank_
 

Friends

class Debug
 
template<typename MatrixType >
MatrixType transpose (MatrixType &A)
 Returns shallow copy of op(A) that is transposed.
 
template<typename MatrixType >
MatrixType conj_transpose (MatrixType &A)
 Returns shallow copy of op(A) that is conjugate-transpose.
 
template<typename T >
void swap (BaseMatrix< T > &A, BaseMatrix< T > &B)
 

Detailed Description

template<typename scalar_t>
class slate::BaseMatrix< scalar_t >

Base class for all SLATE distributed, tiled matrices.

In general, the documentation refers to the current matrix object as op(A), to emphasize that it may be transposed with respect to its parent matrix.

Constructor & Destructor Documentation

◆ BaseMatrix() [1/5]

template<typename scalar_t >
slate::BaseMatrix< scalar_t >::BaseMatrix
protected

[internal] Default constructor creates an empty matrix.

Does not allocate any memory.

◆ BaseMatrix() [2/5]

template<typename scalar_t >
slate::BaseMatrix< scalar_t >::BaseMatrix ( int64_t  m,
int64_t  n,
std::function< int64_t(int64_t i)> &  inTileMb,
std::function< int64_t(int64_t j)> &  inTileNb,
std::function< int(ij_tuple ij)> &  inTileRank,
std::function< int(ij_tuple ij)> &  inTileDevice,
MPI_Comm  mpi_comm 
)
protected

[internal] Construct matrix with mt block rows and nt block columns, such that sum_{i = 0}^{mt-1} tileMb(i) >= m, sum_{j = 0}^{nt-1} tileNb(j) >= n, where tileMb, tileNb, tileRank, tileDevice are given as functions.

No tiles are allocated. Creates empty matrix storage.

Parameters
[in]mNumber of rows of the matrix. m >= 0.
[in]nNumber of columns of the matrix. n >= 0
[in]inTileMbFunction that takes block-row index, returns block-row size.
[in]inTileNbFunction that takes block-col index, returns block-col size.
[in]inTileRankFunction that takes tuple of { block-row, block-col } indices, returns MPI rank for that tile.
[in]inTileDeviceFunction that takes tuple of { block-row, block-col } indices, returns local GPU device ID for that tile.
[in]mpi_commMPI communicator to distribute matrix across.

◆ BaseMatrix() [3/5]

template<typename scalar_t >
slate::BaseMatrix< scalar_t >::BaseMatrix ( int64_t  m,
int64_t  n,
int64_t  mb,
int64_t  nb,
GridOrder  order,
int  nprow,
int  npcol,
MPI_Comm  mpi_comm 
)
protected

[internal] Construct matrix with mt = ceil( m / mb ) block rows and nt = ceil( n / nb ) block columns, with fixed mb-by-nb tile size and 2D block cyclic distribution.

No tiles are allocated. Creates empty matrix storage.

Parameters
[in]mNumber of rows of the matrix. m >= 0.
[in]nNumber of columns of the matrix. n >= 0
[in]mbRow block size in 2D block-cyclic distribution. mb > 0.
[in]nbColumn block size in 2D block-cyclic distribution. nb > 0.
[in]orderOrder to map MPI processes to tile grid, GridOrder::ColMajor (default) or GridOrder::RowMajor.
[in]nprowNumber of process rows in 2D block-cyclic distribution. nprow > 0.
[in]npcolNumber of process cols of 2D block-cyclic distribution. npcol > 0.
[in]mpi_commMPI communicator to distribute matrix across. nprow * npcol <= MPI_Comm_size( mpi_comm ).

◆ BaseMatrix() [4/5]

template<typename scalar_t >
slate::BaseMatrix< scalar_t >::BaseMatrix ( BaseMatrix< scalar_t > &  orig,
int64_t  i1,
int64_t  i2,
int64_t  j1,
int64_t  j2 
)
protected

[internal] Sub-matrix constructor creates shallow copy view of parent matrix, B[ i1:i2, j1:j2 ].

B[ 0:mt-1, 0:nt-1 ] is the entire B matrix. If i2 < i1 or j2 < j1, produces empty matrix. See Matrix::sub().

Parameters
[in,out]BParent matrix B.
[in]i1Starting block row index. 0 <= i1 < mt.
[in]i2Ending block row index (inclusive). i2 < mt.
[in]j1Starting block column index. 0 <= j1 < nt.
[in]j2Ending block column index (inclusive). j2 < nt.

◆ BaseMatrix() [5/5]

template<typename scalar_t >
slate::BaseMatrix< scalar_t >::BaseMatrix ( BaseMatrix< scalar_t > &  orig,
Slice  slice 
)
protected

[internal] Sliced matrix constructor creates shallow copy view of parent matrix, B[ row1:row2, col1:col2 ].

This takes row & col indices instead of block row & block col indices. B[ 0:m-1, 0:n-1 ] is the entire B matrix. If row2 < row1 or col2 < col1, produces empty matrix. See Matrix::slice().

Parameters
[in,out]BParent matrix B.
[in]sliceStart and end row and column indices:
  • slice.row1: Starting row index. 0 <= row1 < m.
  • slice.row2: Ending row index (inclusive). row2 < m.
  • slice.col1: Starting column index. 0 <= col1 < n.
  • slice.col2: Ending column index (inclusive). col2 < n.

Member Function Documentation

◆ allocateBatchArrays()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::allocateBatchArrays ( int64_t  batch_size,
int64_t  num_arrays 
)
inline

Allocates batch arrays and BLAS++ queues for all devices.

Matrix classes override this with versions that can also allocate based on the number of local tiles.

Parameters
[in]batch_sizeAllocate batch arrays as needed so that size of each batch array >= batch_size >= 0.
[in]num_arraysAllocate batch arrays as needed so that number of batch arrays per device >= num_arrays >= 1.

◆ allocWorkspaceBuffer()

template<typename scalar_t >
scalar_t * slate::BaseMatrix< scalar_t >::allocWorkspaceBuffer ( int  device,
int64_t  size 
)

Allocates a workspace buffer using the matrix's memory pool.

The memory must be freed with BaseMatrix::freeWorkspaceBuffer

Parameters
[in]deviceDevice ID (GPU or Host) where the memory block is needed.
[in]sizeThe required allocation size
Returns
Pointer to the buffer

◆ array_device()

template<typename scalar_t >
scalar_t ** slate::BaseMatrix< scalar_t >::array_device ( int  device,
int64_t  batch_arrays_index = 0 
)
inline
Returns
batch arrays on device

◆ array_host()

template<typename scalar_t >
scalar_t ** slate::BaseMatrix< scalar_t >::array_host ( int  device,
int64_t  batch_arrays_index = 0 
)
inline
Returns
batch arrays on host, to send to device

◆ batchArraySize()

template<typename scalar_t >
int64_t slate::BaseMatrix< scalar_t >::batchArraySize ( )
inline
Returns
currently allocated batch array size

◆ clear()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::clear ( )
inline

Removes all tiles from matrix.

WARNING: currently this clears the entire parent matrix, not just a sub-matrix.

◆ clearBatchArrays()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::clearBatchArrays ( )
inline

Removes batch arrays from matrix for all devices.

WARNING: currently this clears the entire parent matrix, not just a sub-matrix.

◆ clearWorkspace()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::clearWorkspace ( )
inline

Removes all temporary host and device workspace tiles from matrix.

WARNING: currently, this clears the entire parent matrix, not just a sub-matrix.

◆ comm_queue()

template<typename scalar_t >
lapack::Queue * slate::BaseMatrix< scalar_t >::comm_queue ( int  device)
inline
Returns
BLAS++ communication queues
Parameters
[in]deviceTile's device ID.

◆ compute_queue()

template<typename scalar_t >
lapack::Queue * slate::BaseMatrix< scalar_t >::compute_queue ( int  device,
int  queue_index = 0 
)
inline
Returns
BLAS++ compute queues
Parameters
[in]deviceTile's device ID
[in]queue_indexThe index of a specific set of queues

◆ freeWorkspaceBuffer()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::freeWorkspaceBuffer ( int  device,
scalar_t *  buffer 
)

Frees a workspace buffer allocated with BaseMatrix::allocWorkspaceBuffer.

Parameters
[in]deviceDevice ID (GPU or Host) where the memory block is needed.
[in]bufferPointer to the buffer

◆ getLocalDevices()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::getLocalDevices ( std::set< int > *  dev_set) const

Puts all devices that have local tiles in the matrix into the set.

Parameters
[out]dev_setOn output, set of device IDs that this sub-matrix has local tiles on.

◆ getRanks()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::getRanks ( std::set< int > *  bcast_set) const

Puts all MPI ranks that have tiles in the matrix into the set.

Parameters
[out]bcast_setOn output, set of MPI ranks that this sub-matrix has tiles on.

◆ globalIndex() [1/2]

template<typename scalar_t >
std::tuple< int64_t, int64_t > slate::BaseMatrix< scalar_t >::globalIndex ( int64_t  i,
int64_t  j 
) const
protected

[internal] Returns index {i, j} in global matrix as a tuple, taking into account the local offset and transpose.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.

◆ globalIndex() [2/2]

template<typename scalar_t >
std::tuple< int64_t, int64_t, int > slate::BaseMatrix< scalar_t >::globalIndex ( int64_t  i,
int64_t  j,
int  device 
) const
protected

[internal] Returns index {i, j, dev} in global matrix as a tuple, taking into account the local offset and transpose.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID.

◆ gridinfo()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::gridinfo ( GridOrder order,
int *  nprow,
int *  npcol,
int *  myrow,
int *  mycol 
)

Get nprow, npcol, myrow, mycol for 2D block cyclic (2DBC) distribution.

If SLATE doesn't know the distribution, sets all values to -1. todo: Assumes col-major 2D block cyclic distribution, not row-major.

Parameters
[out]orderOrder to map MPI processes to tile grid.
[out]nprowNumber of process rows.
[out]npcolNumber of process cols.
[out]myrowProcess row for this process (MPI rank).
[out]mycolProcess col for this process (MPI rank).

◆ listBcast()

template<typename scalar_t >
template<Target target>
void slate::BaseMatrix< scalar_t >::listBcast ( BcastList &  bcast_list,
Layout  layout,
int  tag = 0,
bool  is_shared = false 
)

Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list.

Data received must be in 'layout' (ColMajor/RowMajor) major.

Template Parameters
targetDestination to target; either Host (default) or Device.
Parameters
[in]bcast_listList of submatrices defining the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}.
[in]layoutIndicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]tagMPI tag, default 0.
[in]is_sharedA flag to get and hold the broadcasted (prefetched) tiles on the devices. This flag prevents any subsequent calls of tileRelease() routine to release these tiles (clear up the devices memories). WARNING: must set unhold these tiles before releasing them to free up the allocated memories.

◆ listBcastMT()

template<typename scalar_t >
template<Target target>
void slate::BaseMatrix< scalar_t >::listBcastMT ( BcastListTag &  bcast_list,
Layout  layout,
bool  is_shared = false 
)

Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list (using OpenMP taskloop and multi-threaded MPI).

Data received must be in 'layout' (ColMajor/RowMajor) major.

Template Parameters
targetDestination to target; either Host (default) or Device.
Parameters
[in]bcast_listList of submatrices defining the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}. Each tile {i, j} to be broadcast has a tag in the bcast_list.
[in]layoutIndicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]is_sharedA flag to get and hold the broadcasted (prefetched) tiles on the devices. This flag prevents any subsequent calls of tileRelease() routine to release these tiles (clear up the devices memories). WARNING: must set unhold these tiles before releasing them to free up the allocated memories.

◆ numComputeQueues()

template<typename scalar_t >
int slate::BaseMatrix< scalar_t >::numComputeQueues ( )
inline
Returns
number of allocated BLAS++ compute queues

◆ operator()()

template<typename scalar_t >
Tile< scalar_t > slate::BaseMatrix< scalar_t >::operator() ( int64_t  i,
int64_t  j,
int  device = HostNum 
)

Get shallow copy of tile {i, j} of op(A) on given device, with the tile's op flag set to match the matrix's.

Parameters
[in]iTile's block row index. 0 <= i1 < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID; default is HostNum.
Returns
Tile {i, j, device}.

◆ originTile()

template<typename scalar_t >
Tile< scalar_t > slate::BaseMatrix< scalar_t >::originTile ( int64_t  i,
int64_t  j 
)

Returns the origin tile instance of tile(i, j)

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.

◆ releaseLocalWorkspace()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::releaseLocalWorkspace ( std::set< ij_tuple > &  tile_set)

Erases a given set of local workspace tiles from all devices including host, if not on hold or modified.

Parameters
[in]tile_setSet of (i, j) tuples indicating indices of tiles to be erased.

◆ releaseRemoteWorkspace()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::releaseRemoteWorkspace ( std::set< ij_tuple > &  tile_set,
int64_t  release_count = 1 
)

Erases a given set of tiles that are not local to node from all devices including host, if not on hold or modified.

Parameters
[in]tile_setSet of (i, j) tuples indicating indices of tiles to be erased.

◆ releaseRemoteWorkspaceTile()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::releaseRemoteWorkspaceTile ( int64_t  i,
int64_t  j,
int64_t  release_count = 1 
)

Erases a given tile that is not local to node from all devices including host, if not on hold or modified.

The tile's receive count is decremented. If the receive count reaches zero, the tile is erased. Otherwise, tile is not erased.

◆ releaseWorkspace()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::releaseWorkspace ( )
inline

Clears all workspace tiles that are not OnHold.

For local tiles, it ensures that a valid copy remains.

Note that local tiles are currently not released if it would leave all remaining tiles invalid, but this behavior may change in the future and should not be relied on.

◆ tileAcquire()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileAcquire ( int64_t  i,
int64_t  j,
int  device,
Layout  layout 
)

Acquire tile(i, j) on device without copying data if not already exists.

This is used when the destination tile's data will be overridden. Converts destination Layout to 'layout' param. Assumes the TileNode(i, j) already exists.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's destination: host or device ID.
[in]layoutIndicates the required Layout of the received tile:
  • ColMajor: column major.
  • RowMajor: row major.

◆ tileBcast()

template<typename scalar_t >
template<Target target>
void slate::BaseMatrix< scalar_t >::tileBcast ( int64_t  i,
int64_t  j,
BaseMatrix< scalar_t > const &  B,
Layout  layout,
int  tag = 0 
)

Send tile {i, j} of op(A) to all MPI ranks in matrix B.

If target is Devices, also copies tile to all devices on each MPI rank. This should be called by at least all ranks with local tiles in B; ones that do not have any local tiles are excluded from the broadcast. Data received must be in 'layout' (ColMajor/RowMajor) major.

Template Parameters
targetDestination to target; either Host (default) or Device.
Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]BSub-matrix B defines the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}.
[in]layoutIndicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]tagMPI tag, default 0.

◆ tileBcastToSet()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileBcastToSet ( int64_t  i,
int64_t  j,
std::set< int > const &  bcast_set,
int  radix,
int  tag,
Layout  layout,
Target  target 
)
protected

[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.

This should be called by all (and only) ranks that are in bcast_set, as either the root sender or a receiver. This function implements a custom pattern using sends and receives. Data received must be in 'layout' (ColMajor/RowMajor) major.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]bcast_setSet of MPI ranks to broadcast to.
[in]radixRadix of the communication pattern.
[in]tagMPI tag, default 0.
[in]layoutIndicates the Layout (ColMajor/RowMajor) of the received data.

◆ tileDeviceFunc()

template<typename scalar_t >
std::function< int(ij_tuple ij)> slate::BaseMatrix< scalar_t >::tileDeviceFunc ( ) const
inline

Returns tileDevice function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileErase()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileErase ( int64_t  i,
int64_t  j,
int  device = HostNum 
)

Erase tile {i, j} of op(A) on device (host, one device or all devices).

If tile's memory was allocated by SLATE, via tileInsert(i, j, dev) or tileInsertWorkspace(i, j, dev), then the memory is released to the allocator pool.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID.

◆ tileGetAllForReading()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAllForReading ( int  device,
LayoutConvert  layout 
)

Gets all local tiles for reading on device.

See also
tileGetForReading.
Parameters
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetAllForReadingOnDevices()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAllForReadingOnDevices ( LayoutConvert  layout)

Gets all local tiles for reading on corresponding devices.

See also
tileGetForReading.
Parameters
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetAllForWriting()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAllForWriting ( int  device,
LayoutConvert  layout 
)

Gets all local tiles for writing on device.

See also
tileGetForWriting.
Parameters
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetAllForWritingOnDevices()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAllForWritingOnDevices ( LayoutConvert  layout)

Gets all local tiles for writing on corresponding devices.

See also
tileGetForWriting.
Parameters
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetAndHold() [1/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAndHold ( int64_t  i,
int64_t  j,
int  device,
LayoutConvert  layout 
)

Gets tile(i, j) on device for reading and marks it as MOSI::OnHold.

Will copy tile in if it does not exist or its state is MOSI::Invalid. Updates the source tile's state to MOSI::Shared if copied-in. Converts destination Layout based on 'layout' param.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetAndHold() [2/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAndHold ( int64_t  i,
int64_t  j,
LayoutConvert  layout 
)
inline

Gets tile(i, j) on host for reading and marks it as MOSI::OnHold.

See also
tileGetAndHold

◆ tileGetAndHold() [3/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAndHold ( std::set< ij_tuple > &  tile_set,
int  device,
LayoutConvert  layout 
)

Gets a set of tiles for reading on device and marks them as MOSI::OnHold.

See also
tileGetAndHold
Parameters
[in]tile_setSet of (i, j) tuples indicating indices of Tiles' to be acquired.
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetAndHold() [4/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAndHold ( std::set< ij_tuple > &  tile_set,
LayoutConvert  layout 
)
inline

Gets a set of tiles for reading on host and marks them as MOSI::OnHold.

See also
tileGetAndHold

◆ tileGetAndHoldAll()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAndHoldAll ( int  device,
LayoutConvert  layout 
)

Gets all local tiles on device and marks them as MOSI::OnHold.

See also
tileGetAndHold.
Parameters
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetAndHoldAllOnDevices()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetAndHoldAllOnDevices ( LayoutConvert  layout)

Gets all local tiles on corresponding devices and marks them as MOSI::OnHold.

See also
tileGetAndHold.
Parameters
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetForReading() [1/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForReading ( int64_t  i,
int64_t  j,
int  device,
LayoutConvert  layout 
)

Gets tile(i, j) for reading on device.

Will copy-in the tile if it does not exist or its state is MOSI::Invalid. Sets tile state to MOSI::Shared if copied-in. Finds a source tile whose state is valid (Modified|Shared) by looping on existing tile instances. Updates source tile's state to shared if copied-in. Converts destination Layout based on 'layout' param.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetForReading() [2/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForReading ( int64_t  i,
int64_t  j,
LayoutConvert  layout 
)
inline

Gets tile(i, j) for reading on host.

See also
tileGetForReading

◆ tileGetForReading() [3/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForReading ( std::set< ij_tuple > &  tile_set,
int  device,
LayoutConvert  layout 
)

Gets a set of tiles for reading on device.

See also
tileGetForReading
Parameters
[in]tile_setSet of (i, j) tuples indicating indices of Tiles' to be acquired.
[in]deviceTile's destination: host or device ID.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetForReading() [4/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForReading ( std::set< ij_tuple > &  tile_set,
LayoutConvert  layout 
)
inline

Gets a set of tiles for reading on host.

See also
tileGetForReading

◆ tileGetForWriting() [1/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForWriting ( int64_t  i,
int64_t  j,
int  device,
LayoutConvert  layout 
)

Gets tile(i, j) for writing on device.

Sets destination tile's state to MOSI::Modified. Will copy-in the tile if it does not exist or its state is MOSI::Invalid. Other instances will be invalidated. Finds a source tile whose state is valid (Modified|Shared) by scanning existing tile instances. Converts destination Layout based on 'layout' param.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetForWriting() [2/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForWriting ( int64_t  i,
int64_t  j,
LayoutConvert  layout 
)
inline

Gets tile(i, j) for writing on host.

See also
tileGetForWriting

◆ tileGetForWriting() [3/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForWriting ( std::set< ij_tuple > &  tile_set,
int  device,
LayoutConvert  layout 
)

Gets a set of tiles for writing on device.

See also
tileGetForWriting
Parameters
[in]tile_setSet of (i, j) tuples indicating indices of Tiles' to be acquired.
[in]deviceTile's destination: host or device ID, defaults to host.
[in]layoutIndicates whether to convert the Layout of the received data:
  • ColMajor: convert layout to column major.
  • RowMajor: convert layout to row major.
  • None: do not convert layout.

◆ tileGetForWriting() [4/4]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileGetForWriting ( std::set< ij_tuple > &  tile_set,
LayoutConvert  layout 
)
inline

Gets a set of tiles for writing on host.

See also
tileGetForWriting

◆ tileIbcastToSet()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileIbcastToSet ( int64_t  i,
int64_t  j,
std::set< int > const &  bcast_set,
int  radix,
int  tag,
Layout  layout,
std::vector< MPI_Request > &  send_requests,
Target  target 
)
protected

[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.

This should be called by all (and only) ranks that are in bcast_set, as either the root sender or a receiver. This function implements a custom pattern using sends and receives. Data received must be in 'layout' (ColMajor/RowMajor) major. Nonblocking sends are used, with requests appended to the provided vector.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]bcast_setSet of MPI ranks to broadcast to.
[in]radixRadix of the communication pattern.
[in]tagMPI tag, default 0.
[in]layoutIndicates the Layout (ColMajor/RowMajor) of the received data.
[in,out]send_requestsVector where requests for this bcast are appended.

◆ tileInsert() [1/3]

template<typename scalar_t >
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert ( int64_t  i,
int64_t  j,
int  device,
scalar_t *  data,
int64_t  ld 
)

Insert tile {i, j} of op(A) with existing data.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID; default is HostNum (provided by overloaded function).
[in,out]dataTile's data. The matrix uses this pointer directly, it does not copy the data, so the data must remain valid while the matrix exists.
[in]ldLeading dimension of data; column stride. ld >= tileMb(i).
Returns
Pointer to new tile.

◆ tileInsert() [2/3]

template<typename scalar_t >
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert ( int64_t  i,
int64_t  j,
int  device = HostNum 
)

Insert tile {i, j} of op(A) and allocate its data.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID; default is HostNum.
Returns
Pointer to new tile.

◆ tileInsert() [3/3]

template<typename scalar_t >
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert ( int64_t  i,
int64_t  j,
scalar_t *  A,
int64_t  ld 
)
inline

Insert tile with default device=HostNum.

See also
tileInsert.

◆ tileInsertWorkspace()

template<typename scalar_t >
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsertWorkspace ( int64_t  i,
int64_t  j,
int  device,
Layout  layout 
)

Insert a workspace tile {i, j} of op(A) and allocate its data.

The tile will be freed

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID; default is HostNum.
Returns
Pointer to new tile.

◆ tileIrecv()

template<typename scalar_t >
template<Target target>
void slate::BaseMatrix< scalar_t >::tileIrecv ( int64_t  i,
int64_t  j,
int  src_rank,
Layout  layout,
int  tag,
MPI_Request *  request 
)

Receive tile {i, j} of op(A) to the given MPI rank using immediate mode.

Tile is allocated as workspace if it doesn't yet exist. Source rank must call tileSend() or tileIsend(). Data received must be in 'layout' (ColMajor/RowMajor) major.

For non-GPU-aware MPI, Irecv currently can't respect the target.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]src_rankSource MPI rank. If src_rank == mpiRank, this is a no-op.
[in]layoutIndicates the Layout (ColMajor/RowMajor) of the received data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]tagMPI tag
[out]requestMPI request object

◆ tileIsend()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileIsend ( int64_t  i,
int64_t  j,
int  dst_rank,
int  tag,
MPI_Request *  request 
)

Immediately send tile {i, j} of op(A) to the given MPI rank.

Destination rank must call tileRecv() or tileIrecv().

Template Parameters
targetDestination to target; either Host (default) or Device.
Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]dst_rankDestination MPI rank. If dst_rank == mpiRank, this is a no-op.
[in]tagMPI tag, default 0.
[out]requestPointer to an MPI_Request struct

◆ tileLayoutConvert() [1/3]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileLayoutConvert ( int  device,
Layout  layout,
bool  reset = false 
)

Converts all existing tile instances on 'device' into 'layout' Operates in batch mode.

Parameters
[in]deviceTiles' host or device ID.
[in]layoutIntended Layout of tiles:
  • Layout::ColMajor or
  • Layout::RowMajor.
[in]resetOptionally resets the tiles extended buffers.

◆ tileLayoutConvert() [2/3]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileLayoutConvert ( int64_t  i,
int64_t  j,
int  device,
Layout  layout,
bool  reset = false,
bool  async = false 
)

Converts tile(i, j, device) into 'layout'.

Tile should exist on 'device', will assert otherwise. Tile will be made Convertible if it was not.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's host or device ID.
[in]layoutIntended Layout of tile:
  • Layout::ColMajor or
  • Layout::RowMajor.
[in]resetOptionally resets the tile extended buffers.

todo: handle op(A), sub-matrix, and sliced-matrix

◆ tileLayoutConvert() [3/3]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileLayoutConvert ( std::set< ij_tuple > &  tile_set,
int  device,
Layout  layout,
bool  reset = false 
)

Converts tiles indicated in 'tile_set' that exist on 'device' into 'layout' if not already in 'layout' major.

Tiles should exist on 'device', will throw exception otherwise. Operates in batch mode when tiles are on devices. If device is not Host, will bucket tiles into uniform size and stride batches, then launches each batch transpose.

Parameters
[in]tile_setSet of (i, j) tuples indicating indices of Tiles' to be converted.
[in]deviceTiles' host or device ID.
[in]layoutIntended Layout of tiles:
  • Layout::ColMajor or
  • Layout::RowMajor.
[in]resetOptionally resets the tiles extended buffers.

◆ tileLayoutConvertOnDevices()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileLayoutConvertOnDevices ( Layout  layout,
bool  reset = false 
)

Converts all existing tile instances on available devices into 'layout'.

Host tiles are not affected. Tiles should exist already on devices. Operates in batch mode.

Parameters
[in]layoutIntended Layout of tiles:
  • Layout::ColMajor or
  • Layout::RowMajor.
[in]resetOptionally resets the tile extended buffers.

◆ tileLayoutIsConvertible()

template<typename scalar_t >
bool slate::BaseMatrix< scalar_t >::tileLayoutIsConvertible ( int64_t  i,
int64_t  j,
int  device = HostNum 
)

Returns whether tile(i, j, device) can be safely transposed.

based on its 'TileKind', buffer size, Layout, and stride. Tile instance on 'device' should exist.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's host or device ID, defaults to host.

◆ tileLayoutReset() [1/3]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileLayoutReset

Converts all origin tiles into current matrix-layout.

Operates in batch mode.

◆ tileLayoutReset() [2/3]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileLayoutReset ( int64_t  i,
int64_t  j,
int  device,
Layout  layout 
)

Converts tile(i, j) into current layout and resets its extended buffer.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID; default is HostNum.
[in]layoutIntended Layout of tiles:
  • Layout::ColMajor or
  • Layout::RowMajor.

◆ tileLayoutReset() [3/3]

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileLayoutReset ( std::set< ij_tuple > &  tile_set,
int  device,
Layout  layout 
)

Converts set of tiles into current layout and resets their extended buffers.

Operates in batch mode.

Parameters
[in]tile_setSet of (i, j) indices of tiles to be converted and reset.
[in]deviceTile's device ID; default is HostNum.
[in]layoutIntended Layout of tiles:
  • Layout::ColMajor or
  • Layout::RowMajor.

◆ tileMb()

template<typename scalar_t >
int64_t slate::BaseMatrix< scalar_t >::tileMb ( int64_t  i) const

Returns number of rows (mb) in block row i of op(A).

Parameters
[in]iTile's block row index. 0 <= i < mt.

◆ tileMbFunc()

template<typename scalar_t >
std::function< int64_t(int64_t i)> slate::BaseMatrix< scalar_t >::tileMbFunc ( ) const
inline

Returns tileMb function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileModified()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileModified ( int64_t  i,
int64_t  j,
int  device = HostNum,
bool  permissive = false 
)

Marks tile(i, j) as Modified on device.

Other instances will be invalidated. Unless permissive, asserts if other instances are in Modified state.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID, defaults to host.
[in]permissiveDefaults to false.

◆ tileNb()

template<typename scalar_t >
int64_t slate::BaseMatrix< scalar_t >::tileNb ( int64_t  j) const

Returns number of cols (nb) in block col j of op(A).

Parameters
[in]jTile's block column index. 0 <= j < nt.

◆ tileNbFunc()

template<typename scalar_t >
std::function< int64_t(int64_t j)> slate::BaseMatrix< scalar_t >::tileNbFunc ( ) const
inline

Returns tileNb function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileOnHold()

template<typename scalar_t >
bool slate::BaseMatrix< scalar_t >::tileOnHold ( int64_t  i,
int64_t  j,
int  device = HostNum 
)
inline

Returns whether tile(i, j) is OnHold on device (defaults to host).

Asserts that the tile exists.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID.

◆ tileRankFunc()

template<typename scalar_t >
std::function< int(ij_tuple ij)> slate::BaseMatrix< scalar_t >::tileRankFunc ( ) const
inline

Returns tileRank function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileReceiveCount()

template<typename scalar_t >
int64_t slate::BaseMatrix< scalar_t >::tileReceiveCount ( int64_t  i,
int64_t  j 
) const
inline

Returns how many times the tile {i, j} is received through MPI.

This function is used to track tiles that may be communicated twice due to symmetry during hemm and symm operations.

◆ tileRecv()

template<typename scalar_t >
template<Target target>
void slate::BaseMatrix< scalar_t >::tileRecv ( int64_t  i,
int64_t  j,
int  src_rank,
Layout  layout,
int  tag = 0 
)

Receive tile {i, j} of op(A) to the given MPI rank.

Tile is allocated as workspace if it doesn't yet exist. Source rank must call tileSend() or tileIsend(). Data received must be in 'layout' (ColMajor/RowMajor) major.

Note that when target=Devices and non-GPU-aware MPI is used, an OpenMP task may be created to copy the tile to device.

Template Parameters
targetDestination to target; either Host (default) or Device.
Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]src_rankSource MPI rank. If src_rank == mpiRank, this is a no-op.
[in]layoutIndicates the Layout (ColMajor/RowMajor) of the received data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]tagMPI tag, default 0.

◆ tileRelease()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileRelease ( int64_t  i,
int64_t  j,
int  device = HostNum 
)

Erase the tile {i, j}'s instance on device if it is a workspace tile with no hold set on it.

If tile's memory was allocated by SLATE, via tileInsert(i, j, dev) or tileInsertWorkspace(i, j, dev), then the memory is released to the allocator pool. For local tiles, it ensures that a valid copy remains.

Note that local tiles are currently not released if it would leave all remaining tiles invalid, but this behavior may change in the future and should not be relied on.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID.

◆ tileSend()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileSend ( int64_t  i,
int64_t  j,
int  dst_rank,
int  tag = 0 
)

Send tile {i, j} of op(A) to the given MPI rank.

Destination rank must call tileRecv() or tileIrecv().

Template Parameters
targetDestination to target; either Host (default) or Device.
Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]dst_rankDestination MPI rank. If dst_rank == mpiRank, this is a no-op.
[in]tagMPI tag, default 0.

◆ tileState()

template<typename scalar_t >
MOSI slate::BaseMatrix< scalar_t >::tileState ( int64_t  i,
int64_t  j,
int  device = HostNum 
)
inline

Returns tile(i, j)'s state on device (defaults to host).

Asserts that the tile exists.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID.

◆ tileTick()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileTick ( int64_t  i,
int64_t  j 
)
inline

Decrements life counter of workspace tile {i, j} of op(A).

Then, if life reaches 0, deletes tile on all devices. For local, non-workspace tiles, does nothing.

◆ tileUnsetHold()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileUnsetHold ( int64_t  i,
int64_t  j,
int  device = HostNum 
)
inline

Unsets the hold of tile(i, j) on device (defaults to host) if it was OnHold.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
[in]deviceTile's device ID.

◆ tileUnsetHoldAll()

template<typename scalar_t >
void slate::BaseMatrix< scalar_t >::tileUnsetHoldAll ( int  device = HostNum)

Unsets all local tiles' hold on device.

Parameters
[in]deviceTile's device ID.

◆ tileUpdateOrigin()

template<typename scalar_t >
Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileUpdateOrigin ( int64_t  i,
int64_t  j 
)

Updates the origin instance of tile(i, j) if MOSI::Invalid tile must be local.

Parameters
[in]iTile's block row index. 0 <= i < mt.
[in]jTile's block column index. 0 <= j < nt.
Returns
Pointer to origin instance of tile(i, j)

Friends And Related Symbol Documentation

◆ conj_transpose

template<typename scalar_t >
template<typename MatrixType >
MatrixType conj_transpose ( MatrixType &  A)
friend

Returns shallow copy of op(A) that is conjugate-transpose.

See also
transpose
Returns
copy of Tile, etc. with updated op flag.
See also
transpose()

◆ transpose

template<typename scalar_t >
template<typename MatrixType >
MatrixType transpose ( MatrixType &  A)
friend

Returns shallow copy of op(A) that is transposed.

See also
conj_transpose
Returns
copy of Tile, etc. with updated op flag.

Making this a template avoids repeating the code ad nauseam in each class. Tile and BaseMatrix make this a friend, to change op.


The documentation for this class was generated from the following file: