Base class for all SLATE distributed, tiled matrices. More...

#include <BaseMatrix.hh>

Inheritance diagram for slate::BaseMatrix< scalar_t >:

Public Types
using	BcastList = std::vector< std::tuple< int64_t, int64_t, std::list< BaseMatrix< scalar_t > > > >

using	BcastListTag = std::vector< std::tuple< int64_t, int64_t, std::list< BaseMatrix< scalar_t > >, int64_t > >

using	ReduceList = std::vector< std::tuple< int64_t, int64_t, BaseMatrix< scalar_t >, std::list< BaseMatrix< scalar_t > > > >

using	ij_tuple = typename MatrixStorage< scalar_t >::ij_tuple

typedef scalar_t	value_type

Public Member Functions
template<typename out_scalar_t >
BaseMatrix< out_scalar_t >	baseEmptyLike (int64_t mb, int64_t nb, Op deepOp)
	[internal]

Tile< scalar_t >	operator() (int64_t i, int64_t j, int device=HostNum)
	Get shallow copy of tile {i, j} of op(A) on given device, with the tile's op flag set to match the matrix's.

Tile< scalar_t >	at (int64_t i, int64_t j, int device=HostNum)
	Alias of operator().

int	num_devices () const
	Returns number of devices (per MPI process) to distribute matrix to.

void	gridinfo (GridOrder order, int nprow, int npcol, int myrow, int *mycol)
	Get nprow, npcol, myrow, mycol for 2D block cyclic (2DBC) distribution.

std::function< int64_t(int64_t i)>	tileMbFunc () const
	Returns tileMb function.

std::function< int64_t(int64_t j)>	tileNbFunc () const
	Returns tileNb function.

std::function< int(ij_tuple ij)>	tileRankFunc () const
	Returns tileRank function.

std::function< int(ij_tuple ij)>	tileDeviceFunc () const
	Returns tileDevice function.

int64_t	m () const
	Returns number of rows in op(A).

int64_t	n () const
	Returns number of columns in op(A).

int64_t	mt () const
	Returns number of block rows in op(A).

int64_t	nt () const
	Returns number of block cols in op(A).

Op	op () const
	Returns transposition operation op(A) as NoTrans, Trans, or ConjTrans.

bool	tileExists (int64_t i, int64_t j, int device=HostNum)
	returns true if tile exists on specified device

int	tileRank (int64_t i, int64_t j) const
	Returns MPI rank of tile {i, j} of op(A).

int	tileDevice (int64_t i, int64_t j) const
	Returns device of tile {i, j} of op(A).

bool	tileIsLocal (int64_t i, int64_t j) const
	Returns whether tile {i, j} of op(A) is local.

Uplo	uplo () const
	Returns whether op(A) is logically Lower, Upper, or General.

Uplo	uploLogical () const
	Returns whether op(A) is logically Lower, Upper, or General storage, taking the transposition operation into account.

Uplo	uploPhysical () const
	Returns whether A is physically Lower, Upper, or General storage, ignoring the transposition operation.

Tile< scalar_t >	originTile (int64_t i, int64_t j)
	Returns the origin tile instance of tile(i, j)

Target	origin () const
	Tile origin.

int64_t	tileMb (int64_t i) const
	Returns number of rows (mb) in block row i of op(A).

int64_t	tileNb (int64_t j) const
	Returns number of cols (nb) in block col j of op(A).

Tile< scalar_t >	tileInsert (int64_t i, int64_t j, int device=HostNum)
	Insert tile {i, j} of op(A) and allocate its data.

Tile< scalar_t >	tileInsert (int64_t i, int64_t j, int device, scalar_t *A, int64_t ld)
	Insert tile {i, j} of op(A) with existing data.

Tile< scalar_t >	tileInsert (int64_t i, int64_t j, scalar_t *A, int64_t ld)
	Insert tile with default device=HostNum.

Tile< scalar_t >	tileInsertWorkspace (int64_t i, int64_t j, int device, Layout layout)
	Insert a workspace tile {i, j} of op(A) and allocate its data.

Tile< scalar_t >	tileInsertWorkspace (int64_t i, int64_t j, int device)

Tile< scalar_t >	tileInsertWorkspace (int64_t i, int64_t j, Layout layout)

Tile< scalar_t >	tileInsertWorkspace (int64_t i, int64_t j)

scalar_t *	allocWorkspaceBuffer (int device, int64_t size)
	Allocates a workspace buffer using the matrix's memory pool.

void	freeWorkspaceBuffer (int device, scalar_t *buffer)
	Frees a workspace buffer allocated with BaseMatrix::allocWorkspaceBuffer.

MOSI	tileState (int64_t i, int64_t j, int device=HostNum)
	Returns tile(i, j)'s state on device (defaults to host).

bool	tileOnHold (int64_t i, int64_t j, int device=HostNum)
	Returns whether tile(i, j) is OnHold on device (defaults to host).

void	tileUnsetHold (int64_t i, int64_t j, int device=HostNum)
	Unsets the hold of tile(i, j) on device (defaults to host) if it was OnHold.

void	tileUnsetHoldAll (int device=HostNum)
	Unsets all local tiles' hold on device.

void	tileUnsetHoldAllOnDevices ()
	Unsets all local tiles' hold on all devices.

void	tileModified (int64_t i, int64_t j, int device=HostNum, bool permissive=false)
	Marks tile(i, j) as Modified on device.

void	tileAcquire (int64_t i, int64_t j, int device, Layout layout)
	Acquire tile(i, j) on device without copying data if not already exists.

void	tileAcquire (int64_t i, int64_t j, Layout layout)

void	tileGetForReading (int64_t i, int64_t j, int device, LayoutConvert layout)
	Gets tile(i, j) for reading on device.

void	tileGetForReading (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout)
	Gets a set of tiles for reading on device.

void	tileGetForReading (int64_t i, int64_t j, LayoutConvert layout)
	Gets tile(i, j) for reading on host.

void	tileGetForReading (std::set< ij_tuple > &tile_set, LayoutConvert layout)
	Gets a set of tiles for reading on host.

void	tileGetForReading (std::set< ij_tuple > &tile_set, LayoutConvert layout, int from_device)

void	tileGetAllForReading (int device, LayoutConvert layout)
	Gets all local tiles for reading on device.

void	tileGetAllForReadingOnDevices (LayoutConvert layout)
	Gets all local tiles for reading on corresponding devices.

void	tileGetForWriting (int64_t i, int64_t j, int device, LayoutConvert layout)
	Gets tile(i, j) for writing on device.

void	tileGetForWriting (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout)
	Gets a set of tiles for writing on device.

void	tileGetForWriting (int64_t i, int64_t j, LayoutConvert layout)
	Gets tile(i, j) for writing on host.

void	tileGetForWriting (std::set< ij_tuple > &tile_set, LayoutConvert layout)
	Gets a set of tiles for writing on host.

void	tileGetAllForWriting (int device, LayoutConvert layout)
	Gets all local tiles for writing on device.

void	tileGetAllForWritingOnDevices (LayoutConvert layout)
	Gets all local tiles for writing on corresponding devices.

void	tileGetAndHold (int64_t i, int64_t j, int device, LayoutConvert layout)
	Gets tile(i, j) on device for reading and marks it as MOSI::OnHold.

void	tileGetAndHold (int64_t i, int64_t j, LayoutConvert layout)
	Gets tile(i, j) on host for reading and marks it as MOSI::OnHold.

void	tileGetAndHold (std::set< ij_tuple > &tile_set, int device, LayoutConvert layout)
	Gets a set of tiles for reading on device and marks them as MOSI::OnHold.

void	tileGetAndHold (std::set< ij_tuple > &tile_set, LayoutConvert layout)
	Gets a set of tiles for reading on host and marks them as MOSI::OnHold.

void	tileGetAndHoldAll (int device, LayoutConvert layout)
	Gets all local tiles on device and marks them as MOSI::OnHold.

void	tileGetAndHoldAllOnDevices (LayoutConvert layout)
	Gets all local tiles on corresponding devices and marks them as MOSI::OnHold.

Tile< scalar_t >	tileUpdateOrigin (int64_t i, int64_t j)
	Updates the origin instance of tile(i, j) if MOSI::Invalid tile must be local.

void	tileUpdateAllOrigin ()
	Updates all origin instances of local tiles if MOSI::Invalid.

int64_t	tileLife (int64_t i, int64_t j) const
	Returns life counter of tile {i, j} of op(A).

void	tileLife (int64_t i, int64_t j, int64_t life)
	Set life counter of tile {i, j} of op(A).

void	tileTick (int64_t i, int64_t j)
	Decrements life counter of workspace tile {i, j} of op(A).

int64_t	tileReceiveCount (int64_t i, int64_t j) const
	Returns how many times the tile {i, j} is received through MPI.

void	tileIncrementReceiveCount (int64_t i, int64_t j)
	Increments the number of times the tile {i, j} is received through MPI.

void	tileDecrementReceiveCount (int64_t i, int64_t j, int64_t release_count=1)
	Decrements the number of times the tile {i, j} is received through MPI.

void	tileErase (int64_t i, int64_t j, int device=HostNum)
	Erase tile {i, j} of op(A) on device (host, one device or all devices).

void	tileRelease (int64_t i, int64_t j, int device=HostNum)
	Erase the tile {i, j}'s instance on device if it is a workspace tile with no hold set on it.

void	tileSend (int64_t i, int64_t j, int dst_rank, int tag=0)
	Send tile {i, j} of op(A) to the given MPI rank.

template<Target target>
void	tileSend (int64_t i, int64_t j, int dst_rank, int tag=0)

void	tileIsend (int64_t i, int64_t j, int dst_rank, int tag, MPI_Request *request)
	Immediately send tile {i, j} of op(A) to the given MPI rank.

template<Target target = Target::Host>
void	tileRecv (int64_t i, int64_t j, int dst_rank, Layout layout, int tag=0)
	Receive tile {i, j} of op(A) to the given MPI rank.

template<Target target = Target::Host>
void	tileIrecv (int64_t i, int64_t j, int dst_rank, Layout layout, int tag, MPI_Request *request)
	Receive tile {i, j} of op(A) to the given MPI rank using immediate mode.

template<Target target = Target::Host>
void	tileBcast (int64_t i, int64_t j, BaseMatrix const &B, Layout layout, int tag=0)
	Send tile {i, j} of op(A) to all MPI ranks in matrix B.

template<Target target = Target::Host>
void	tileBcast (int64_t i, int64_t j, BaseMatrix const &B, Layout layout, int tag, int64_t life_factor)

template<Target target = Target::Host>
void	listBcast (BcastList &bcast_list, Layout layout, int tag=0, bool is_shared=false)
	Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list.

template<Target target = Target::Host>
void	listBcast (BcastList &bcast_list, Layout layout, int tag, int64_t life_factor, bool is_shared=false)

template<Target target = Target::Host>
void	listBcastMT (BcastListTag &bcast_list, Layout layout, bool is_shared=false)
	Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list (using OpenMP taskloop and multi-threaded MPI).

template<Target target = Target::Host>
void	listBcastMT (BcastListTag &bcast_list, Layout layout, int64_t life_factor, bool is_shared=false)

template<Target target = Target::Host>
void	listReduce (ReduceList &reduce_list, Layout layout, int tag=0)

Layout	layout () const
	Returns matrix layout flag.

Layout	tileLayout (int64_t i, int64_t j, int device=HostNum)
	Returns Layout of tile(i, j, device)

void	tileLayout (int64_t i, int64_t j, int device, Layout layout)
	Sets Layout of tile(i, j, device)

void	tileLayout (int64_t i, int64_t j, Layout layout)
	Sets Layout of tile(i, j, host)

bool	tileLayoutIsConvertible (int64_t i, int64_t j, int device=HostNum)
	Returns whether tile(i, j, device) can be safely transposed.

void	tileLayoutConvert (int64_t i, int64_t j, int device, Layout layout, bool reset=false, bool async=false)
	Converts tile(i, j, device) into 'layout'.

void	tileLayoutConvert (int64_t i, int64_t j, Layout layout, bool reset=false, bool async=false)
	Convert layout of tile(i, j) to layout on host, optionally reset.

void	tileLayoutConvert (std::set< ij_tuple > &tile_set, int device, Layout layout, bool reset=false)
	Converts tiles indicated in 'tile_set' that exist on 'device' into 'layout' if not already in 'layout' major.

void	tileLayoutConvert (std::set< ij_tuple > &tile_set, Layout layout, bool reset=false)
	Convert layout of a set of tiles to layout on host, optionally reset.

void	tileLayoutConvert (int device, Layout layout, bool reset=false)
	Converts all existing tile instances on 'device' into 'layout' Operates in batch mode.

void	tileLayoutConvertOnDevices (Layout layout, bool reset=false)
	Converts all existing tile instances on available devices into 'layout'.

void	tileLayoutReset (int64_t i, int64_t j, int device, Layout layout)
	Converts tile(i, j) into current layout and resets its extended buffer.

void	tileLayoutReset (int64_t i, int64_t j, Layout layout)

void	tileLayoutReset (std::set< ij_tuple > &tile_set, int device, Layout layout)
	Converts set of tiles into current layout and resets their extended buffers.

void	tileLayoutReset (std::set< ij_tuple > &tile_set, Layout layout)

void	tileLayoutReset ()
	Converts all origin tiles into current matrix-layout.

void	tileReduceFromSet (int64_t i, int64_t j, int root_rank, std::set< int > &reduce_set, int radix, int tag, Layout layout)
	[internal] WARNING: Sent and received tiles are converted to 'layout' major.

void	getRanks (std::set< int > *bcast_set) const
	Puts all MPI ranks that have tiles in the matrix into the set.

void	getLocalDevices (std::set< int > *dev_set) const
	Puts all devices that have local tiles in the matrix into the set.

int64_t	numLocalTiles () const
	Returns number of local tiles in this matrix.

MPI_Comm	mpiComm () const

int	mpiRank () const

MPI_Group	mpiGroup () const

void	clear ()
	Removes all tiles from matrix.

void	releaseWorkspace ()
	Clears all workspace tiles that are not OnHold.

void	releaseLocalWorkspaceTile (int64_t i, int64_t j)
	Erases a given local workspace tile, if not on hold or modified.

void	releaseLocalWorkspace ()
	Erases all local workspace tiles, if not on hold or modified.

void	releaseLocalWorkspace (std::set< ij_tuple > &tile_set)
	Erases a given set of local workspace tiles from all devices including host, if not on hold or modified.

void	releaseRemoteWorkspaceTile (int64_t i, int64_t j, int64_t release_count=1)
	Erases a given tile that is not local to node from all devices including host, if not on hold or modified.

void	releaseRemoteWorkspace (int64_t receive_count=1)
	Erases tiles that are not local to node from all devices including host, if not on hold or modified.

void	releaseRemoteWorkspace (std::set< ij_tuple > &tile_set, int64_t release_count=1)
	Erases a given set of tiles that are not local to node from all devices including host, if not on hold or modified.

void	clearWorkspace ()
	Removes all temporary host and device workspace tiles from matrix.

void	allocateBatchArrays (int64_t batch_size, int64_t num_arrays)
	Allocates batch arrays and BLAS++ queues for all devices.

void	clearBatchArrays ()
	Removes batch arrays from matrix for all devices.

int64_t	batchArraySize ()

scalar_t **	array_host (int device, int64_t batch_arrays_index=0)

scalar_t **	array_device (int device, int64_t batch_arrays_index=0)

lapack::Queue *	comm_queue (int device)

lapack::Queue *	compute_queue (int device, int queue_index=0)

int	numComputeQueues ()

Static Public Attributes
static constexpr bool	is_complex = slate::is_complex<scalar_t>::value

static constexpr bool	is_real = ! is_complex

Protected Member Functions
	BaseMatrix ()
	[internal] Default constructor creates an empty matrix.

	BaseMatrix (int64_t m, int64_t n, std::function< int64_t(int64_t i)> &inTileMb, std::function< int64_t(int64_t j)> &inTileNb, std::function< int(ij_tuple ij)> &inTileRank, std::function< int(ij_tuple ij)> &inTileDevice, MPI_Comm mpi_comm)
	[internal] Construct matrix with mt block rows and nt block columns, such that sum_{i = 0}^{mt-1} tileMb(i) >= m, sum_{j = 0}^{nt-1} tileNb(j) >= n, where tileMb, tileNb, tileRank, tileDevice are given as functions.

	BaseMatrix (int64_t m, int64_t n, int64_t mb, int64_t nb, GridOrder order, int nprow, int npcol, MPI_Comm mpi_comm)
	[internal] Construct matrix with mt = ceil( m / mb ) block rows and nt = ceil( n / nb ) block columns, with fixed mb-by-nb tile size and 2D block cyclic distribution.

	BaseMatrix (int64_t m, int64_t n, int64_t mb, int64_t nb, int nprow, int npcol, MPI_Comm mpi_comm)
	With order = Col.

	BaseMatrix (int64_t m, int64_t n, int64_t nb, int nprow, int npcol, MPI_Comm mpi_comm)
	With mb = nb, order = Col.

	BaseMatrix (BaseMatrix &orig, int64_t i1, int64_t i2, int64_t j1, int64_t j2)
	[internal] Sub-matrix constructor creates shallow copy view of parent matrix, B[ i1:i2, j1:j2 ].

	BaseMatrix (BaseMatrix &orig, Slice slice)
	[internal] Sliced matrix constructor creates shallow copy view of parent matrix, B[ row1:row2, col1:col2 ].

void	tileBcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set)

void	tileBcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set, int radix, int tag, Layout layout, Target target)
	[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.

void	tileIbcastToSet (int64_t i, int64_t j, std::set< int > const &bcast_set, int radix, int tag, Layout layout, std::vector< MPI_Request > &send_requests, Target target)
	[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.

std::tuple< int64_t, int64_t >	globalIndex (int64_t i, int64_t j) const
	[internal] Returns index {i, j} in global matrix as a tuple, taking into account the local offset and transpose.

std::tuple< int64_t, int64_t, int >	globalIndex (int64_t i, int64_t j, int device) const
	[internal] Returns index {i, j, dev} in global matrix as a tuple, taking into account the local offset and transpose.

int64_t	row0_offset () const
	row offset of first block row.

int64_t	col0_offset () const
	col offset of first block col.

int64_t	last_mb () const
	rows in last block row.

int64_t	last_nb () const
	cols in last block col.

int64_t	ioffset () const
	block row offset with respect to original matrix

int64_t	joffset () const
	block col offset with respect to original matrix

Protected Attributes
Uplo	uplo_
	upper or lower storage

Op	op_
	transpose operation with respect to original matrix

Layout	layout_
	intended layout of the matrix. defaults to ColMajor.

Target	origin_
	Tile origins. defaults to Host.

std::shared_ptr< MatrixStorage< scalar_t > >	storage_
	shared storage of tiles and buffers

MPI_Comm	mpi_comm_

MPI_Group	mpi_group_

int	mpi_rank_

Friends
class	Debug

template<typename MatrixType >
MatrixType	transpose (MatrixType &A)
	Returns shallow copy of op(A) that is transposed.

template<typename MatrixType >
MatrixType	conj_transpose (MatrixType &A)
	Returns shallow copy of op(A) that is conjugate-transpose.

template<typename T >
void	swap (BaseMatrix< T > &A, BaseMatrix< T > &B)

Detailed Description

template<typename scalar_t>
class slate::BaseMatrix< scalar_t >

Base class for all SLATE distributed, tiled matrices.

In general, the documentation refers to the current matrix object as op(A), to emphasize that it may be transposed with respect to its parent matrix.

Constructor & Destructor Documentation

◆ BaseMatrix() [1/5]

template<typename scalar_t >

slate::BaseMatrix< scalar_t >::BaseMatrix

protected

[internal] Default constructor creates an empty matrix.

Does not allocate any memory.

◆ BaseMatrix() [2/5]

template<typename scalar_t >

slate::BaseMatrix< scalar_t >::BaseMatrix	(	int64_t	m,
		int64_t	n,
		std::function< int64_t(int64_t i)> &	inTileMb,
		std::function< int64_t(int64_t j)> &	inTileNb,
		std::function< int(ij_tuple ij)> &	inTileRank,
		std::function< int(ij_tuple ij)> &	inTileDevice,
		MPI_Comm	mpi_comm
	)

protected

[internal] Construct matrix with mt block rows and nt block columns, such that sum_{i = 0}^{mt-1} tileMb(i) >= m, sum_{j = 0}^{nt-1} tileNb(j) >= n, where tileMb, tileNb, tileRank, tileDevice are given as functions.

No tiles are allocated. Creates empty matrix storage.

Parameters

[in]	m	Number of rows of the matrix. m >= 0.
[in]	n	Number of columns of the matrix. n >= 0
[in]	inTileMb	Function that takes block-row index, returns block-row size.
[in]	inTileNb	Function that takes block-col index, returns block-col size.
[in]	inTileRank	Function that takes tuple of { block-row, block-col } indices, returns MPI rank for that tile.
[in]	inTileDevice	Function that takes tuple of { block-row, block-col } indices, returns local GPU device ID for that tile.
[in]	mpi_comm	MPI communicator to distribute matrix across.

◆ BaseMatrix() [3/5]

template<typename scalar_t >

slate::BaseMatrix< scalar_t >::BaseMatrix	(	int64_t	m,
		int64_t	n,
		int64_t	mb,
		int64_t	nb,
		GridOrder	order,
		int	nprow,
		int	npcol,
		MPI_Comm	mpi_comm
	)

protected

[internal] Construct matrix with mt = ceil( m / mb ) block rows and nt = ceil( n / nb ) block columns, with fixed mb-by-nb tile size and 2D block cyclic distribution.

No tiles are allocated. Creates empty matrix storage.

Parameters

[in]	m	Number of rows of the matrix. m >= 0.
[in]	n	Number of columns of the matrix. n >= 0
[in]	mb	Row block size in 2D block-cyclic distribution. mb > 0.
[in]	nb	Column block size in 2D block-cyclic distribution. nb > 0.
[in]	order	Order to map MPI processes to tile grid, GridOrder::ColMajor (default) or GridOrder::RowMajor.
[in]	nprow	Number of process rows in 2D block-cyclic distribution. nprow > 0.
[in]	npcol	Number of process cols of 2D block-cyclic distribution. npcol > 0.
[in]	mpi_comm	MPI communicator to distribute matrix across. nprow * npcol <= MPI_Comm_size( mpi_comm ).

◆ BaseMatrix() [4/5]

template<typename scalar_t >

slate::BaseMatrix< scalar_t >::BaseMatrix	(	BaseMatrix< scalar_t > &	orig,
		int64_t	i1,
		int64_t	i2,
		int64_t	j1,
		int64_t	j2
	)

protected

[internal] Sub-matrix constructor creates shallow copy view of parent matrix, B[ i1:i2, j1:j2 ].

B[ 0:mt-1, 0:nt-1 ] is the entire B matrix. If i2 < i1 or j2 < j1, produces empty matrix. See Matrix::sub().

Parameters

[in,out]	B	Parent matrix B.
[in]	i1	Starting block row index. 0 <= i1 < mt.
[in]	i2	Ending block row index (inclusive). i2 < mt.
[in]	j1	Starting block column index. 0 <= j1 < nt.
[in]	j2	Ending block column index (inclusive). j2 < nt.

◆ BaseMatrix() [5/5]

template<typename scalar_t >

slate::BaseMatrix< scalar_t >::BaseMatrix	(	BaseMatrix< scalar_t > &	orig,
		Slice	slice
	)

protected

[internal] Sliced matrix constructor creates shallow copy view of parent matrix, B[ row1:row2, col1:col2 ].

This takes row & col indices instead of block row & block col indices. B[ 0:m-1, 0:n-1 ] is the entire B matrix. If row2 < row1 or col2 < col1, produces empty matrix. See Matrix::slice().

Parameters

[in,out]	B	Parent matrix B.
[in]	slice	Start and end row and column indices:

slice.row1: Starting row index. 0 <= row1 < m.
slice.row2: Ending row index (inclusive). row2 < m.
slice.col1: Starting column index. 0 <= col1 < n.
slice.col2: Ending column index (inclusive). col2 < n.

Member Function Documentation

◆ allocateBatchArrays()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::allocateBatchArrays	(	int64_t	batch_size,
		int64_t	num_arrays
	)

inline

Allocates batch arrays and BLAS++ queues for all devices.

Matrix classes override this with versions that can also allocate based on the number of local tiles.

Parameters

[in]	batch_size	Allocate batch arrays as needed so that size of each batch array >= batch_size >= 0.
[in]	num_arrays	Allocate batch arrays as needed so that number of batch arrays per device >= num_arrays >= 1.

◆ allocWorkspaceBuffer()

template<typename scalar_t >

scalar_t * slate::BaseMatrix< scalar_t >::allocWorkspaceBuffer	(	int	device,
		int64_t	size
	)

Allocates a workspace buffer using the matrix's memory pool.

The memory must be freed with BaseMatrix::freeWorkspaceBuffer

Parameters

[in]	device	Device ID (GPU or Host) where the memory block is needed.
[in]	size	The required allocation size

Returns: Pointer to the buffer

◆ array_device()

template<typename scalar_t >

scalar_t ** slate::BaseMatrix< scalar_t >::array_device	(	int	device,
		int64_t	batch_arrays_index = `0`
	)

inline

Returns: batch arrays on device

◆ array_host()

template<typename scalar_t >

scalar_t ** slate::BaseMatrix< scalar_t >::array_host	(	int	device,
		int64_t	batch_arrays_index = `0`
	)

inline

Returns: batch arrays on host, to send to device

◆ batchArraySize()

template<typename scalar_t >

int64_t slate::BaseMatrix< scalar_t >::batchArraySize ( )

inline

Returns: currently allocated batch array size

◆ clear()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::clear ( )

inline

Removes all tiles from matrix.

WARNING: currently this clears the entire parent matrix, not just a sub-matrix.

◆ clearBatchArrays()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::clearBatchArrays ( )

inline

Removes batch arrays from matrix for all devices.

WARNING: currently this clears the entire parent matrix, not just a sub-matrix.

◆ clearWorkspace()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::clearWorkspace ( )

inline

Removes all temporary host and device workspace tiles from matrix.

WARNING: currently, this clears the entire parent matrix, not just a sub-matrix.

◆ comm_queue()

template<typename scalar_t >

lapack::Queue * slate::BaseMatrix< scalar_t >::comm_queue ( int device )

inline

Returns: BLAS++ communication queues

Parameters

[in] device Tile's device ID.

◆ compute_queue()

template<typename scalar_t >

lapack::Queue * slate::BaseMatrix< scalar_t >::compute_queue	(	int	device,
		int	queue_index = `0`
	)

inline

Returns: BLAS++ compute queues

Parameters

[in]	device	Tile's device ID
[in]	queue_index	The index of a specific set of queues

◆ freeWorkspaceBuffer()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::freeWorkspaceBuffer	(	int	device,
		scalar_t *	buffer
	)

Frees a workspace buffer allocated with BaseMatrix::allocWorkspaceBuffer.

Parameters

[in]	device	Device ID (GPU or Host) where the memory block is needed.
[in]	buffer	Pointer to the buffer

◆ getLocalDevices()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::getLocalDevices ( std::set< int > * dev_set ) const

Puts all devices that have local tiles in the matrix into the set.

Parameters

[out] dev_set On output, set of device IDs that this sub-matrix has local tiles on.

◆ getRanks()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::getRanks ( std::set< int > * bcast_set ) const

Puts all MPI ranks that have tiles in the matrix into the set.

Parameters

[out] bcast_set On output, set of MPI ranks that this sub-matrix has tiles on.

◆ globalIndex() [1/2]

template<typename scalar_t >

std::tuple< int64_t, int64_t > slate::BaseMatrix< scalar_t >::globalIndex	(	int64_t	i,
		int64_t	j
	)		const

protected

[internal] Returns index {i, j} in global matrix as a tuple, taking into account the local offset and transpose.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.

◆ globalIndex() [2/2]

template<typename scalar_t >

std::tuple< int64_t, int64_t, int > slate::BaseMatrix< scalar_t >::globalIndex	(	int64_t	i,
		int64_t	j,
		int	device
	)		const

protected

[internal] Returns index {i, j, dev} in global matrix as a tuple, taking into account the local offset and transpose.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID.

◆ gridinfo()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::gridinfo	(	GridOrder *	order,
		int *	nprow,
		int *	npcol,
		int *	myrow,
		int *	mycol
	)

Get nprow, npcol, myrow, mycol for 2D block cyclic (2DBC) distribution.

If SLATE doesn't know the distribution, sets all values to -1. todo: Assumes col-major 2D block cyclic distribution, not row-major.

Parameters

[out]	order	Order to map MPI processes to tile grid.
[out]	nprow	Number of process rows.
[out]	npcol	Number of process cols.
[out]	myrow	Process row for this process (MPI rank).
[out]	mycol	Process col for this process (MPI rank).

◆ listBcast()

template<typename scalar_t >

template<Target target>

void slate::BaseMatrix< scalar_t >::listBcast	(	BcastList &	bcast_list,
		Layout	layout,
		int	tag = `0`,
		bool	is_shared = `false`
	)

Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list.

Data received must be in 'layout' (ColMajor/RowMajor) major.

Template Parameters

target Destination to target; either Host (default) or Device.

Parameters

[in]	bcast_list	List of submatrices defining the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}.
[in]	layout	Indicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]	tag	MPI tag, default 0.
[in]	is_shared	A flag to get and hold the broadcasted (prefetched) tiles on the devices. This flag prevents any subsequent calls of tileRelease() routine to release these tiles (clear up the devices memories). WARNING: must set unhold these tiles before releasing them to free up the allocated memories.

◆ listBcastMT()

template<typename scalar_t >

template<Target target>

void slate::BaseMatrix< scalar_t >::listBcastMT	(	BcastListTag &	bcast_list,
		Layout	layout,
		bool	is_shared = `false`
	)

Send tile {i, j} of op(A) to all MPI ranks in the list of submatrices bcast_list (using OpenMP taskloop and multi-threaded MPI).

Data received must be in 'layout' (ColMajor/RowMajor) major.

Template Parameters

target Destination to target; either Host (default) or Device.

Parameters

[in]	bcast_list	List of submatrices defining the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}. Each tile {i, j} to be broadcast has a tag in the bcast_list.
[in]	layout	Indicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]	is_shared	A flag to get and hold the broadcasted (prefetched) tiles on the devices. This flag prevents any subsequent calls of tileRelease() routine to release these tiles (clear up the devices memories). WARNING: must set unhold these tiles before releasing them to free up the allocated memories.

◆ numComputeQueues()

template<typename scalar_t >

int slate::BaseMatrix< scalar_t >::numComputeQueues ( )

inline

Returns: number of allocated BLAS++ compute queues

◆ operator()()

template<typename scalar_t >

Tile< scalar_t > slate::BaseMatrix< scalar_t >::operator()	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

Get shallow copy of tile {i, j} of op(A) on given device, with the tile's op flag set to match the matrix's.

Parameters

[in]	i	Tile's block row index. 0 <= i1 < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID; default is HostNum.

Returns: Tile {i, j, device}.

◆ originTile()

template<typename scalar_t >

Tile< scalar_t > slate::BaseMatrix< scalar_t >::originTile	(	int64_t	i,
		int64_t	j
	)

Returns the origin tile instance of tile(i, j)

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.

◆ releaseLocalWorkspace()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::releaseLocalWorkspace ( std::set< ij_tuple > & tile_set )

Erases a given set of local workspace tiles from all devices including host, if not on hold or modified.

Parameters

[in] tile_set Set of (i, j) tuples indicating indices of tiles to be erased.

◆ releaseRemoteWorkspace()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::releaseRemoteWorkspace	(	std::set< ij_tuple > &	tile_set,
		int64_t	release_count = `1`
	)

Erases a given set of tiles that are not local to node from all devices including host, if not on hold or modified.

Parameters

[in] tile_set Set of (i, j) tuples indicating indices of tiles to be erased.

◆ releaseRemoteWorkspaceTile()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::releaseRemoteWorkspaceTile	(	int64_t	i,
		int64_t	j,
		int64_t	release_count = `1`
	)

Erases a given tile that is not local to node from all devices including host, if not on hold or modified.

The tile's receive count is decremented. If the receive count reaches zero, the tile is erased. Otherwise, tile is not erased.

◆ releaseWorkspace()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::releaseWorkspace ( )

inline

Clears all workspace tiles that are not OnHold.

For local tiles, it ensures that a valid copy remains.

Note that local tiles are currently not released if it would leave all remaining tiles invalid, but this behavior may change in the future and should not be relied on.

◆ tileAcquire()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileAcquire	(	int64_t	i,
		int64_t	j,
		int	device,
		Layout	layout
	)

Acquire tile(i, j) on device without copying data if not already exists.

This is used when the destination tile's data will be overridden. Converts destination Layout to 'layout' param. Assumes the TileNode(i, j) already exists.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's destination: host or device ID.
[in]	layout	Indicates the required Layout of the received tile: ColMajor: column major. RowMajor: row major.

◆ tileBcast()

template<typename scalar_t >

template<Target target>

void slate::BaseMatrix< scalar_t >::tileBcast	(	int64_t	i,
		int64_t	j,
		BaseMatrix< scalar_t > const &	B,
		Layout	layout,
		int	tag = `0`
	)

Send tile {i, j} of op(A) to all MPI ranks in matrix B.

If target is Devices, also copies tile to all devices on each MPI rank. This should be called by at least all ranks with local tiles in B; ones that do not have any local tiles are excluded from the broadcast. Data received must be in 'layout' (ColMajor/RowMajor) major.

Template Parameters

target Destination to target; either Host (default) or Device.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	B	Sub-matrix B defines the MPI ranks to send to. Usually it is the portion of the matrix to be updated by tile {i, j}.
[in]	layout	Indicates the Layout (ColMajor/RowMajor) of the broadcasted data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]	tag	MPI tag, default 0.

◆ tileBcastToSet()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileBcastToSet	(	int64_t	i,
		int64_t	j,
		std::set< int > const &	bcast_set,
		int	radix,
		int	tag,
		Layout	layout,
		Target	target
	)

protected

[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.

This should be called by all (and only) ranks that are in bcast_set, as either the root sender or a receiver. This function implements a custom pattern using sends and receives. Data received must be in 'layout' (ColMajor/RowMajor) major.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	bcast_set	Set of MPI ranks to broadcast to.
[in]	radix	Radix of the communication pattern.
[in]	tag	MPI tag, default 0.
[in]	layout	Indicates the Layout (ColMajor/RowMajor) of the received data.

◆ tileDeviceFunc()

template<typename scalar_t >

std::function< int(ij_tuple ij)> slate::BaseMatrix< scalar_t >::tileDeviceFunc ( ) const

inline

Returns tileDevice function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileErase()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileErase	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

Erase tile {i, j} of op(A) on device (host, one device or all devices).

If tile's memory was allocated by SLATE, via tileInsert(i, j, dev) or tileInsertWorkspace(i, j, dev), then the memory is released to the allocator pool.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID.

◆ tileGetAllForReading()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAllForReading	(	int	device,
		LayoutConvert	layout
	)

Gets all local tiles for reading on device.

See also: tileGetForReading.

Parameters

[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetAllForReadingOnDevices()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAllForReadingOnDevices ( LayoutConvert layout )

Gets all local tiles for reading on corresponding devices.

See also: tileGetForReading.

Parameters

[in]

layout

Indicates whether to convert the Layout of the received data:

ColMajor: convert layout to column major.
RowMajor: convert layout to row major.
None: do not convert layout.

◆ tileGetAllForWriting()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAllForWriting	(	int	device,
		LayoutConvert	layout
	)

Gets all local tiles for writing on device.

See also: tileGetForWriting.

Parameters

[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetAllForWritingOnDevices()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAllForWritingOnDevices ( LayoutConvert layout )

Gets all local tiles for writing on corresponding devices.

See also: tileGetForWriting.

Parameters

[in]

layout

Indicates whether to convert the Layout of the received data:

ColMajor: convert layout to column major.
RowMajor: convert layout to row major.
None: do not convert layout.

◆ tileGetAndHold() [1/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAndHold	(	int64_t	i,
		int64_t	j,
		int	device,
		LayoutConvert	layout
	)

Gets tile(i, j) on device for reading and marks it as MOSI::OnHold.

Will copy tile in if it does not exist or its state is MOSI::Invalid. Updates the source tile's state to MOSI::Shared if copied-in. Converts destination Layout based on 'layout' param.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetAndHold() [2/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAndHold	(	int64_t	i,
		int64_t	j,
		LayoutConvert	layout
	)

inline

Gets tile(i, j) on host for reading and marks it as MOSI::OnHold.

See also: tileGetAndHold

◆ tileGetAndHold() [3/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAndHold	(	std::set< ij_tuple > &	tile_set,
		int	device,
		LayoutConvert	layout
	)

Gets a set of tiles for reading on device and marks them as MOSI::OnHold.

See also: tileGetAndHold

Parameters

[in]	tile_set	Set of (i, j) tuples indicating indices of Tiles' to be acquired.
[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetAndHold() [4/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAndHold	(	std::set< ij_tuple > &	tile_set,
		LayoutConvert	layout
	)

inline

Gets a set of tiles for reading on host and marks them as MOSI::OnHold.

See also: tileGetAndHold

◆ tileGetAndHoldAll()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAndHoldAll	(	int	device,
		LayoutConvert	layout
	)

Gets all local tiles on device and marks them as MOSI::OnHold.

See also: tileGetAndHold.

Parameters

[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetAndHoldAllOnDevices()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetAndHoldAllOnDevices ( LayoutConvert layout )

Gets all local tiles on corresponding devices and marks them as MOSI::OnHold.

See also: tileGetAndHold.

Parameters

[in]

layout

Indicates whether to convert the Layout of the received data:

ColMajor: convert layout to column major.
RowMajor: convert layout to row major.
None: do not convert layout.

◆ tileGetForReading() [1/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForReading	(	int64_t	i,
		int64_t	j,
		int	device,
		LayoutConvert	layout
	)

Gets tile(i, j) for reading on device.

Will copy-in the tile if it does not exist or its state is MOSI::Invalid. Sets tile state to MOSI::Shared if copied-in. Finds a source tile whose state is valid (Modified|Shared) by looping on existing tile instances. Updates source tile's state to shared if copied-in. Converts destination Layout based on 'layout' param.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetForReading() [2/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForReading	(	int64_t	i,
		int64_t	j,
		LayoutConvert	layout
	)

inline

Gets tile(i, j) for reading on host.

See also: tileGetForReading

◆ tileGetForReading() [3/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForReading	(	std::set< ij_tuple > &	tile_set,
		int	device,
		LayoutConvert	layout
	)

Gets a set of tiles for reading on device.

See also: tileGetForReading

Parameters

[in]	tile_set	Set of (i, j) tuples indicating indices of Tiles' to be acquired.
[in]	device	Tile's destination: host or device ID.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetForReading() [4/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForReading	(	std::set< ij_tuple > &	tile_set,
		LayoutConvert	layout
	)

inline

Gets a set of tiles for reading on host.

See also: tileGetForReading

◆ tileGetForWriting() [1/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForWriting	(	int64_t	i,
		int64_t	j,
		int	device,
		LayoutConvert	layout
	)

Gets tile(i, j) for writing on device.

Sets destination tile's state to MOSI::Modified. Will copy-in the tile if it does not exist or its state is MOSI::Invalid. Other instances will be invalidated. Finds a source tile whose state is valid (Modified|Shared) by scanning existing tile instances. Converts destination Layout based on 'layout' param.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetForWriting() [2/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForWriting	(	int64_t	i,
		int64_t	j,
		LayoutConvert	layout
	)

inline

Gets tile(i, j) for writing on host.

See also: tileGetForWriting

◆ tileGetForWriting() [3/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForWriting	(	std::set< ij_tuple > &	tile_set,
		int	device,
		LayoutConvert	layout
	)

Gets a set of tiles for writing on device.

See also: tileGetForWriting

Parameters

[in]	tile_set	Set of (i, j) tuples indicating indices of Tiles' to be acquired.
[in]	device	Tile's destination: host or device ID, defaults to host.
[in]	layout	Indicates whether to convert the Layout of the received data: ColMajor: convert layout to column major. RowMajor: convert layout to row major. None: do not convert layout.

◆ tileGetForWriting() [4/4]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileGetForWriting	(	std::set< ij_tuple > &	tile_set,
		LayoutConvert	layout
	)

inline

Gets a set of tiles for writing on host.

See also: tileGetForWriting

◆ tileIbcastToSet()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileIbcastToSet	(	int64_t	i,
		int64_t	j,
		std::set< int > const &	bcast_set,
		int	radix,
		int	tag,
		Layout	layout,
		std::vector< MPI_Request > &	send_requests,
		Target	target
	)

protected

[internal] Broadcast tile {i, j} to all MPI ranks in the bcast_set.

This should be called by all (and only) ranks that are in bcast_set, as either the root sender or a receiver. This function implements a custom pattern using sends and receives. Data received must be in 'layout' (ColMajor/RowMajor) major. Nonblocking sends are used, with requests appended to the provided vector.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	bcast_set	Set of MPI ranks to broadcast to.
[in]	radix	Radix of the communication pattern.
[in]	tag	MPI tag, default 0.
[in]	layout	Indicates the Layout (ColMajor/RowMajor) of the received data.
[in,out]	send_requests	Vector where requests for this bcast are appended.

◆ tileInsert() [1/3]

template<typename scalar_t >

Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert	(	int64_t	i,
		int64_t	j,
		int	device,
		scalar_t *	data,
		int64_t	ld
	)

Insert tile {i, j} of op(A) with existing data.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID; default is HostNum (provided by overloaded function).
[in,out]	data	Tile's data. The matrix uses this pointer directly, it does not copy the data, so the data must remain valid while the matrix exists.
[in]	ld	Leading dimension of data; column stride. ld >= tileMb(i).

Returns: Pointer to new tile.

◆ tileInsert() [2/3]

template<typename scalar_t >

Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

Insert tile {i, j} of op(A) and allocate its data.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID; default is HostNum.

Returns: Pointer to new tile.

◆ tileInsert() [3/3]

template<typename scalar_t >

Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsert	(	int64_t	i,
		int64_t	j,
		scalar_t *	A,
		int64_t	ld
	)

inline

Insert tile with default device=HostNum.

See also: tileInsert.

◆ tileInsertWorkspace()

template<typename scalar_t >

Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileInsertWorkspace	(	int64_t	i,
		int64_t	j,
		int	device,
		Layout	layout
	)

Insert a workspace tile {i, j} of op(A) and allocate its data.

The tile will be freed

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID; default is HostNum.

Returns: Pointer to new tile.

◆ tileIrecv()

template<typename scalar_t >

template<Target target>

void slate::BaseMatrix< scalar_t >::tileIrecv	(	int64_t	i,
		int64_t	j,
		int	src_rank,
		Layout	layout,
		int	tag,
		MPI_Request *	request
	)

Receive tile {i, j} of op(A) to the given MPI rank using immediate mode.

Tile is allocated as workspace if it doesn't yet exist. Source rank must call tileSend() or tileIsend(). Data received must be in 'layout' (ColMajor/RowMajor) major.

For non-GPU-aware MPI, Irecv currently can't respect the target.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	src_rank	Source MPI rank. If src_rank == mpiRank, this is a no-op.
[in]	layout	Indicates the Layout (ColMajor/RowMajor) of the received data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]	tag	MPI tag
[out]	request	MPI request object

◆ tileIsend()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileIsend	(	int64_t	i,
		int64_t	j,
		int	dst_rank,
		int	tag,
		MPI_Request *	request
	)

Immediately send tile {i, j} of op(A) to the given MPI rank.

Destination rank must call tileRecv() or tileIrecv().

Template Parameters

target Destination to target; either Host (default) or Device.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	dst_rank	Destination MPI rank. If dst_rank == mpiRank, this is a no-op.
[in]	tag	MPI tag, default 0.
[out]	request	Pointer to an MPI_Request struct

◆ tileLayoutConvert() [1/3]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileLayoutConvert	(	int	device,
		Layout	layout,
		bool	reset = `false`
	)

Converts all existing tile instances on 'device' into 'layout' Operates in batch mode.

Parameters

[in]	device	Tiles' host or device ID.
[in]	layout	Intended Layout of tiles: Layout::ColMajor or Layout::RowMajor.
[in]	reset	Optionally resets the tiles extended buffers.

◆ tileLayoutConvert() [2/3]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileLayoutConvert	(	int64_t	i,
		int64_t	j,
		int	device,
		Layout	layout,
		bool	reset = `false`,
		bool	async = `false`
	)

Converts tile(i, j, device) into 'layout'.

Tile should exist on 'device', will assert otherwise. Tile will be made Convertible if it was not.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's host or device ID.
[in]	layout	Intended Layout of tile: Layout::ColMajor or Layout::RowMajor.
[in]	reset	Optionally resets the tile extended buffers.

todo: handle op(A), sub-matrix, and sliced-matrix

◆ tileLayoutConvert() [3/3]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileLayoutConvert	(	std::set< ij_tuple > &	tile_set,
		int	device,
		Layout	layout,
		bool	reset = `false`
	)

Converts tiles indicated in 'tile_set' that exist on 'device' into 'layout' if not already in 'layout' major.

Tiles should exist on 'device', will throw exception otherwise. Operates in batch mode when tiles are on devices. If device is not Host, will bucket tiles into uniform size and stride batches, then launches each batch transpose.

Parameters

[in]	tile_set	Set of (i, j) tuples indicating indices of Tiles' to be converted.
[in]	device	Tiles' host or device ID.
[in]	layout	Intended Layout of tiles: Layout::ColMajor or Layout::RowMajor.
[in]	reset	Optionally resets the tiles extended buffers.

◆ tileLayoutConvertOnDevices()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileLayoutConvertOnDevices	(	Layout	layout,
		bool	reset = `false`
	)

Converts all existing tile instances on available devices into 'layout'.

Host tiles are not affected. Tiles should exist already on devices. Operates in batch mode.

Parameters

[in]	layout	Intended Layout of tiles: Layout::ColMajor or Layout::RowMajor.
[in]	reset	Optionally resets the tile extended buffers.

◆ tileLayoutIsConvertible()

template<typename scalar_t >

bool slate::BaseMatrix< scalar_t >::tileLayoutIsConvertible	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

Returns whether tile(i, j, device) can be safely transposed.

based on its 'TileKind', buffer size, Layout, and stride. Tile instance on 'device' should exist.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's host or device ID, defaults to host.

◆ tileLayoutReset() [1/3]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileLayoutReset

Converts all origin tiles into current matrix-layout.

Operates in batch mode.

◆ tileLayoutReset() [2/3]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileLayoutReset	(	int64_t	i,
		int64_t	j,
		int	device,
		Layout	layout
	)

Converts tile(i, j) into current layout and resets its extended buffer.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID; default is HostNum.
[in]	layout	Intended Layout of tiles: Layout::ColMajor or Layout::RowMajor.

◆ tileLayoutReset() [3/3]

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileLayoutReset	(	std::set< ij_tuple > &	tile_set,
		int	device,
		Layout	layout
	)

Converts set of tiles into current layout and resets their extended buffers.

Operates in batch mode.

Parameters

[in]	tile_set	Set of (i, j) indices of tiles to be converted and reset.
[in]	device	Tile's device ID; default is HostNum.
[in]	layout	Intended Layout of tiles: Layout::ColMajor or Layout::RowMajor.

◆ tileMb()

template<typename scalar_t >

int64_t slate::BaseMatrix< scalar_t >::tileMb ( int64_t i ) const

Returns number of rows (mb) in block row i of op(A).

Parameters

[in] i Tile's block row index. 0 <= i < mt.

◆ tileMbFunc()

template<typename scalar_t >

std::function< int64_t(int64_t i)> slate::BaseMatrix< scalar_t >::tileMbFunc ( ) const

inline

Returns tileMb function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileModified()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileModified	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`,
		bool	permissive = `false`
	)

Marks tile(i, j) as Modified on device.

Other instances will be invalidated. Unless permissive, asserts if other instances are in Modified state.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID, defaults to host.
[in]	permissive	Defaults to false.

◆ tileNb()

template<typename scalar_t >

int64_t slate::BaseMatrix< scalar_t >::tileNb ( int64_t j ) const

Returns number of cols (nb) in block col j of op(A).

Parameters

[in] j Tile's block column index. 0 <= j < nt.

◆ tileNbFunc()

template<typename scalar_t >

std::function< int64_t(int64_t j)> slate::BaseMatrix< scalar_t >::tileNbFunc ( ) const

inline

Returns tileNb function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileOnHold()

template<typename scalar_t >

bool slate::BaseMatrix< scalar_t >::tileOnHold	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

inline

Returns whether tile(i, j) is OnHold on device (defaults to host).

Asserts that the tile exists.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID.

◆ tileRankFunc()

template<typename scalar_t >

std::function< int(ij_tuple ij)> slate::BaseMatrix< scalar_t >::tileRankFunc ( ) const

inline

Returns tileRank function.

Useful to construct matrices with the same block size. For submatrices, this is of the parent matrix.

◆ tileReceiveCount()

template<typename scalar_t >

int64_t slate::BaseMatrix< scalar_t >::tileReceiveCount	(	int64_t	i,
		int64_t	j
	)		const

inline

Returns how many times the tile {i, j} is received through MPI.

This function is used to track tiles that may be communicated twice due to symmetry during hemm and symm operations.

◆ tileRecv()

template<typename scalar_t >

template<Target target>

void slate::BaseMatrix< scalar_t >::tileRecv	(	int64_t	i,
		int64_t	j,
		int	src_rank,
		Layout	layout,
		int	tag = `0`
	)

Receive tile {i, j} of op(A) to the given MPI rank.

Tile is allocated as workspace if it doesn't yet exist. Source rank must call tileSend() or tileIsend(). Data received must be in 'layout' (ColMajor/RowMajor) major.

Note that when target=Devices and non-GPU-aware MPI is used, an OpenMP task may be created to copy the tile to device.

Template Parameters

target Destination to target; either Host (default) or Device.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	src_rank	Source MPI rank. If src_rank == mpiRank, this is a no-op.
[in]	layout	Indicates the Layout (ColMajor/RowMajor) of the received data. WARNING: must match the layout of the tile in the sender MPI rank.
[in]	tag	MPI tag, default 0.

◆ tileRelease()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileRelease	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

Erase the tile {i, j}'s instance on device if it is a workspace tile with no hold set on it.

If tile's memory was allocated by SLATE, via tileInsert(i, j, dev) or tileInsertWorkspace(i, j, dev), then the memory is released to the allocator pool. For local tiles, it ensures that a valid copy remains.

Note that local tiles are currently not released if it would leave all remaining tiles invalid, but this behavior may change in the future and should not be relied on.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID.

◆ tileSend()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileSend	(	int64_t	i,
		int64_t	j,
		int	dst_rank,
		int	tag = `0`
	)

Send tile {i, j} of op(A) to the given MPI rank.

Destination rank must call tileRecv() or tileIrecv().

Template Parameters

target Destination to target; either Host (default) or Device.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	dst_rank	Destination MPI rank. If dst_rank == mpiRank, this is a no-op.
[in]	tag	MPI tag, default 0.

◆ tileState()

template<typename scalar_t >

MOSI slate::BaseMatrix< scalar_t >::tileState	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

inline

Returns tile(i, j)'s state on device (defaults to host).

Asserts that the tile exists.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID.

◆ tileTick()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileTick	(	int64_t	i,
		int64_t	j
	)

inline

Decrements life counter of workspace tile {i, j} of op(A).

Then, if life reaches 0, deletes tile on all devices. For local, non-workspace tiles, does nothing.

◆ tileUnsetHold()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileUnsetHold	(	int64_t	i,
		int64_t	j,
		int	device = `HostNum`
	)

inline

Unsets the hold of tile(i, j) on device (defaults to host) if it was OnHold.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.
[in]	device	Tile's device ID.

◆ tileUnsetHoldAll()

template<typename scalar_t >

void slate::BaseMatrix< scalar_t >::tileUnsetHoldAll ( int device = HostNum )

Unsets all local tiles' hold on device.

Parameters

[in] device Tile's device ID.

◆ tileUpdateOrigin()

template<typename scalar_t >

Tile< scalar_t > slate::BaseMatrix< scalar_t >::tileUpdateOrigin	(	int64_t	i,
		int64_t	j
	)

Updates the origin instance of tile(i, j) if MOSI::Invalid tile must be local.

Parameters

[in]	i	Tile's block row index. 0 <= i < mt.
[in]	j	Tile's block column index. 0 <= j < nt.

Returns: Pointer to origin instance of tile(i, j)

Friends And Related Symbol Documentation

◆ conj_transpose

template<typename scalar_t >

template<typename MatrixType >

MatrixType conj_transpose ( MatrixType & A )

friend

Returns shallow copy of op(A) that is conjugate-transpose.

See also: transpose

Returns: copy of Tile, etc. with updated op flag.

See also: transpose()

◆ transpose

template<typename scalar_t >

template<typename MatrixType >

MatrixType transpose ( MatrixType & A )

friend

Returns shallow copy of op(A) that is transposed.

See also: conj_transpose

Returns: copy of Tile, etc. with updated op flag.

Making this a template avoids repeating the code ad nauseam in each class. Tile and BaseMatrix make this a friend, to change op.

The documentation for this class was generated from the following file:

include/slate/BaseMatrix.hh

Public Types

Public Member Functions

Static Public Attributes

Protected Member Functions

Protected Attributes

Friends

Detailed Description

Constructor & Destructor Documentation

◆ BaseMatrix() [1/5]

◆ BaseMatrix() [2/5]

◆ BaseMatrix() [3/5]

◆ BaseMatrix() [4/5]

◆ BaseMatrix() [5/5]

Member Function Documentation

◆ allocateBatchArrays()

◆ allocWorkspaceBuffer()

◆ array_device()

◆ array_host()

◆ batchArraySize()

◆ clear()

◆ clearBatchArrays()

◆ clearWorkspace()

◆ comm_queue()

◆ compute_queue()

◆ freeWorkspaceBuffer()

◆ getLocalDevices()

◆ getRanks()

◆ globalIndex() [1/2]

◆ globalIndex() [2/2]

◆ gridinfo()

◆ listBcast()

◆ listBcastMT()

◆ numComputeQueues()

◆ operator()()

◆ originTile()

◆ releaseLocalWorkspace()

◆ releaseRemoteWorkspace()

◆ releaseRemoteWorkspaceTile()

◆ releaseWorkspace()

◆ tileAcquire()

◆ tileBcast()

◆ tileBcastToSet()

◆ tileDeviceFunc()

◆ tileErase()

◆ tileGetAllForReading()

◆ tileGetAllForReadingOnDevices()

◆ tileGetAllForWriting()

◆ tileGetAllForWritingOnDevices()

◆ tileGetAndHold() [1/4]

◆ tileGetAndHold() [2/4]

◆ tileGetAndHold() [3/4]

◆ tileGetAndHold() [4/4]

◆ tileGetAndHoldAll()

◆ tileGetAndHoldAllOnDevices()

◆ tileGetForReading() [1/4]

◆ tileGetForReading() [2/4]

◆ tileGetForReading() [3/4]

◆ tileGetForReading() [4/4]

◆ tileGetForWriting() [1/4]

◆ tileGetForWriting() [2/4]

◆ tileGetForWriting() [3/4]

◆ tileGetForWriting() [4/4]

◆ tileIbcastToSet()

◆ tileInsert() [1/3]

◆ tileInsert() [2/3]

◆ tileInsert() [3/3]

◆ tileInsertWorkspace()

◆ tileIrecv()

◆ tileIsend()

◆ tileLayoutConvert() [1/3]

◆ tileLayoutConvert() [2/3]

◆ tileLayoutConvert() [3/3]

◆ tileLayoutConvertOnDevices()

◆ tileLayoutIsConvertible()

◆ tileLayoutReset() [1/3]

◆ tileLayoutReset() [2/3]

◆ tileLayoutReset() [3/3]

◆ tileMb()

◆ tileMbFunc()

◆ tileModified()