SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
Loading...
Searching...
No Matches
slate::device Namespace Reference

GPU device implementations of kernels. More...

Classes

struct  nx_traits
 Look up NX based on data type. More...
 

Functions

template<typename src_scalar_t , typename dst_scalar_t >
void gecopy (int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 Batched routine for element-wise copy and precision conversion, copying A to B.
 
template<typename src_scalar_t , typename dst_scalar_t >
void tzcopy (Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
void geadd (int64_t m, int64_t n, scalar_t const &alpha, scalar_t *A, int64_t lda, scalar_t const &beta, scalar_t *B, int64_t ldb, blas::Queue &queue)
 Routine for element-wise tile addition.
 
template<typename scalar_t >
void tzadd (Uplo uplo, int64_t m, int64_t n, scalar_t const &alpha, scalar_t **Aarray, int64_t lda, scalar_t const &beta, scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t , typename scalar_t2 >
void gescale (int64_t m, int64_t n, scalar_t2 numer, scalar_t2 denom, scalar_t *A, int64_t lda, blas::Queue &queue)
 Routine for element-wise tile scale.
 
template<typename scalar_t , typename scalar_t2 >
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Batched routine for row and column scaling.
 
template<typename scalar_t >
void geset (int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue)
 Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals.
 
template<typename scalar_t >
void tzset (Uplo uplo, int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue)
 
template<typename scalar_t >
void genorm (Norm norm, NormScope scope, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
void henorm (Norm norm, Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
void synorm (Norm norm, Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
void synormOffdiag (Norm norm, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
void trnorm (Norm norm, Uplo uplo, Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
void transpose (bool is_conj, int64_t n, scalar_t *A, int64_t lda, blas::Queue &queue)
 Physically transpose a square matrix in place.
 
template<typename scalar_t >
void transpose_batch (bool is_conj, int64_t n, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Physically transpose a batch of square matrices in place.
 
template<typename scalar_t >
void transpose (bool is_conj, int64_t m, int64_t n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue)
 Physically transpose a rectangular matrix out-of-place.
 
template<typename scalar_t >
void transpose_batch (bool is_conj, int64_t m, int64_t n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 Physically transpose a batch of rectangular matrices out-of-place.
 
template<typename scalar_t >
__device__ void geadd_func (int64_t m, int64_t n, scalar_t alpha, scalar_t *A, int64_t lda, scalar_t beta, scalar_t *B, int64_t ldb)
 Device function implementing element-wise tile addition.
 
template<typename scalar_t >
__global__ void geadd_kernel (int64_t m, int64_t n, scalar_t alpha, scalar_t *A, int64_t lda, scalar_t beta, scalar_t *B, int64_t ldb)
 Kernel implementing element-wise tile addition.
 
template<typename scalar_t >
__global__ void geadd_batch_kernel (int64_t m, int64_t n, scalar_t alpha, scalar_t **Aarray, int64_t lda, scalar_t beta, scalar_t **Barray, int64_t ldb)
 Kernel implementing element-wise tile addition for a batch of tiles.
 
template void geadd (int64_t m, int64_t n, float const &alpha, float *Aarray, int64_t lda, float const &beta, float *Barray, int64_t ldb, blas::Queue &queue)
 
template void geadd (int64_t m, int64_t n, double const &alpha, double *Aarray, int64_t lda, double const &beta, double *Barray, int64_t ldb, blas::Queue &queue)
 
template<>
void geadd (int64_t m, int64_t n, std::complex< float > const &alpha, std::complex< float > *Aarray, int64_t lda, std::complex< float > const &beta, std::complex< float > *Barray, int64_t ldb, blas::Queue &queue)
 
template<>
void geadd (int64_t m, int64_t n, std::complex< double > const &alpha, std::complex< double > *Aarray, int64_t lda, std::complex< double > const &beta, std::complex< double > *Barray, int64_t ldb, blas::Queue &queue)
 
template<typename src_scalar_t , typename dst_scalar_t >
__global__ void gecopy_kernel (int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb)
 Kernel implementing copy and precision conversions, copying A to B.
 
template void gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void gecopy (int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void gecopy (int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void gecopy (int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void gecopy (int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
__global__ void genorm_max_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void genorm_one_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void genorm_inf_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each row of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void genorm_fro_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void ge_col_norms_max_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *col_max, int64_t ldv)
 
template<typename scalar_t >
void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
 
template void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t , typename scalar_t2 >
__device__ void gescale_func (int64_t m, int64_t n, scalar_t2 mul, scalar_t *A, int64_t lda)
 Device function implementing element-wise tile scale.
 
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_kernel (int64_t m, int64_t n, scalar_t2 mul, scalar_t *A, int64_t lda)
 Kernel implementing element-wise tile scale.
 
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_batch_kernel (int64_t m, int64_t n, scalar_t2 mul, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile scale.
 
template void gescale (int64_t m, int64_t n, float numer, float denom, float *A, int64_t lda, blas::Queue &queue)
 
template void gescale (int64_t m, int64_t n, double numer, double denom, double *A, int64_t lda, blas::Queue &queue)
 
template<>
void gescale (int64_t m, int64_t n, float numer, float denom, std::complex< float > *A, int64_t lda, blas::Queue &queue)
 
template<>
void gescale (int64_t m, int64_t n, std::complex< float > numer, std::complex< float > denom, std::complex< float > *A, int64_t lda, blas::Queue &queue)
 
template<>
void gescale (int64_t m, int64_t n, double numer, double denom, std::complex< double > *A, int64_t lda, blas::Queue &queue)
 
template<>
void gescale (int64_t m, int64_t n, std::complex< double > numer, std::complex< double > denom, std::complex< double > *A, int64_t lda, blas::Queue &queue)
 
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_row_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda)
 Kernel implementing row and column scaling.
 
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda)
 Kernel implementing column scaling.
 
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_row_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t **Aarray, int64_t lda)
 Kernel implementing row scaling.
 
template void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, float const *const *Rarray, float const *const *Carray, float **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, double const *const *Rarray, double const *const *Carray, double **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template<>
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, float const *const *Rarray, float const *const *Carray, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template<>
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, double const *const *Rarray, double const *const *Carray, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template<>
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, std::complex< float > const *const *Rarray, std::complex< float > const *const *Carray, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template<>
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, std::complex< double > const *const *Rarray, std::complex< double > const *const *Carray, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
__device__ void geset_func (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Device function implementing element-wise tile set.
 
template<typename scalar_t >
__global__ void geset_kernel (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Kernel implementing element-wise tile set.
 
template<typename scalar_t >
__global__ void geset_batch_kernel (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile set.
 
template void geset (int64_t m, int64_t n, float const &offdiag_value, float const &diag_value, float *A, int64_t lda, blas::Queue &queue)
 
template void geset (int64_t m, int64_t n, double const &offdiag_value, double const &diag_value, double *A, int64_t lda, blas::Queue &queue)
 
template<>
void geset (int64_t m, int64_t n, std::complex< float > const &offdiag_value, std::complex< float > const &diag_value, std::complex< float > *A, int64_t lda, blas::Queue &queue)
 
template<>
void geset (int64_t m, int64_t n, std::complex< double > const &offdiag_value, std::complex< double > const &diag_value, std::complex< double > *A, int64_t lda, blas::Queue &queue)
 
template<typename scalar_t >
__global__ void henorm_max_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void henorm_one_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void henorm_fro_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
 
template<typename scalar_t >
void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
 
template void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
__global__ void synorm_max_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void synorm_one_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void synorm_fro_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
 
template<typename scalar_t >
void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
 
template<typename scalar_t >
__global__ void synorm_offdiag_one_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each row and each column of elements, for each tile in Aarray.
 
template<typename scalar_t >
void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
 
template void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
__device__ void transpose_func (bool is_conj, int n, scalar_t *A, int64_t lda)
 Device routine handles one matrix.
 
template<typename scalar_t , int NX>
__device__ void transpose_func (bool is_conj, int m, int n, const scalar_t *A, int64_t lda, scalar_t *AT, int64_t ldat)
 tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB.
 
template<typename scalar_t >
__global__ void transpose_kernel (bool is_conj, int n, scalar_t *A, int64_t lda)
 in-place transpose of a square buffer
 
template<typename scalar_t >
__global__ void transpose_batch_kernel (bool is_conj, int n, scalar_t **Aarray, int64_t lda)
 in-place transpose of array of square buffers
 
template<typename scalar_t , int NX>
__global__ void transpose_kernel (bool is_conj, int m, int n, const scalar_t *A, int64_t lda, scalar_t *AT, int64_t ldat)
 Out-of-place transpose of a rectangular buffer; transposes A onto AT.
 
template<typename scalar_t , int NX>
__global__ void transpose_batch_kernel (bool is_conj, int m, int n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat)
 Out-of-place transpose of an array of rectangular buffers; transposes dA_array onto dAT_array.
 
template void transpose (bool is_conj, int64_t n, float *A, int64_t lda, blas::Queue &queue)
 
template void transpose (bool is_conj, int64_t n, double *A, int64_t lda, blas::Queue &queue)
 
template void transpose (bool is_conj, int64_t m, int64_t n, float *A, int64_t lda, float *B, int64_t ldb, blas::Queue &queue)
 
template void transpose (bool is_conj, int64_t m, int64_t n, double *A, int64_t lda, double *B, int64_t ldb, blas::Queue &queue)
 
template<>
void transpose (bool is_conj, int64_t n, std::complex< float > *A, int64_t lda, blas::Queue &queue)
 
template<>
void transpose (bool is_conj, int64_t n, std::complex< double > *A, int64_t lda, blas::Queue &queue)
 
template<>
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< float > *A, int64_t lda, std::complex< float > *B, int64_t ldb, blas::Queue &queue)
 
template<>
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< double > *A, int64_t lda, std::complex< double > *B, int64_t ldb, blas::Queue &queue)
 
template void transpose_batch (bool is_conj, int64_t n, float **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template void transpose_batch (bool is_conj, int64_t n, double **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template void transpose_batch (bool is_conj, int64_t m, int64_t n, float **Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void transpose_batch (bool is_conj, int64_t m, int64_t n, double **Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t n, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t n, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< float > **Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< double > **Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
__global__ void trnorm_max_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void trnorm_one_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void trnorm_inf_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each row of elements, for each tile in Aarray.
 
template<typename scalar_t >
__global__ void trnorm_fro_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
 
template<typename scalar_t >
void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each trapezoidal tile.
 
template void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<>
void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
__global__ void tzadd_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t alpha, scalar_t **Aarray, int64_t lda, scalar_t beta, scalar_t **Barray, int64_t ldb)
 Kernel implementing element-wise tile addition.
 
template<typename scalar_t >
void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t const &alpha, scalar_t **Aarray, int64_t lda, scalar_t const &beta, scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 Batched routine for element-wise trapezoidal tile addition.
 
template void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, float const &alpha, float **Aarray, int64_t lda, float const &beta, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, double const &alpha, double **Aarray, int64_t lda, double const &beta, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const &alpha, std::complex< float > **Aarray, int64_t lda, std::complex< float > const &beta, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const &alpha, std::complex< double > **Aarray, int64_t lda, std::complex< double > const &beta, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<typename src_scalar_t , typename dst_scalar_t >
__global__ void tzcopy_kernel (lapack::Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb)
 Kernel implementing copy and precision conversions, copying A to B.
 
template<typename src_scalar_t , typename dst_scalar_t >
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 Batched routine for element-wise trapezoidal copy and precision conversion, copying A to B.
 
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<>
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t >
__global__ void tzscale_kernel (lapack::Uplo uplo, int64_t m, int64_t n, blas::real_type< scalar_t > numer, blas::real_type< scalar_t > denom, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile scale.
 
template<typename scalar_t >
__device__ void tzset_func (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Device function implementing element-wise tile set.
 
template<typename scalar_t >
__global__ void tzset_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Kernel implementing element-wise tile set.
 
template<typename scalar_t >
__global__ void tzset_batch_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile set.
 
template<typename scalar_t >
void tzset (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue)
 Element-wise trapezoidal tile set.
 
template void tzset (lapack::Uplo uplo, int64_t m, int64_t n, float const &offdiag_value, float const &diag_value, float *A, int64_t lda, blas::Queue &queue)
 
template void tzset (lapack::Uplo uplo, int64_t m, int64_t n, double const &offdiag_value, double const &diag_value, double *A, int64_t lda, blas::Queue &queue)
 
template<>
void tzset (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const &offdiag_value, std::complex< float > const &diag_value, std::complex< float > *A, int64_t lda, blas::Queue &queue)
 
template<>
void tzset (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const &offdiag_value, std::complex< double > const &diag_value, std::complex< double > *A, int64_t lda, blas::Queue &queue)
 
template<typename real_t >
__host__ __device__ real_t max_nan (real_t x, real_t y)
 Max that propagates nan consistently: max_nan( 1, nan ) = nan and max_nan( nan, 1 ) = nan.
 
template<typename real_t >
__device__ void max_nan_reduce (int n, int tid, real_t *x)
 Max reduction of n-element array x, leaving total in x[0].
 
template<typename real_t >
__device__ void sum_reduce (int n, int tid, real_t *x)
 Sum reduction of n-element array x, leaving total in x[0].
 
__host__ __device__ double real (rocblas_double_complex x)
 
__host__ __device__ float real (rocblas_float_complex x)
 
__host__ __device__ double imag (rocblas_double_complex x)
 
__host__ __device__ float imag (rocblas_float_complex x)
 
__host__ __device__ rocblas_double_complex conj (rocblas_double_complex x)
 
__host__ __device__ rocblas_float_complex conj (rocblas_float_complex x)
 
__host__ __device__ double real (double x)
 
__host__ __device__ float real (float x)
 
__host__ __device__ double imag (double x)
 
__host__ __device__ float imag (float x)
 
__host__ __device__ double conj (double x)
 
__host__ __device__ float conj (float x)
 
__host__ __device__ float abs (float x)
 Overloaded versions of absolute value on device.
 
__host__ __device__ double abs (double x)
 
__host__ __device__ float abs (cuFloatComplex x)
 
__host__ __device__ double abs (cuDoubleComplex x)
 
template<typename scalar_t >
__host__ __device__ scalar_t sqr (scalar_t x)
 Square of number.
 
template<typename real_t >
__host__ __device__ void combine_sumsq (real_t &scale1, real_t &sumsq1, real_t scale2, real_t sumsq2)
 Adds two scaled, sum-of-squares representations.
 
template<typename real_t >
__host__ __device__ void add_sumsq (real_t &scale, real_t &sumsq, real_t absx)
 Adds new value to scaled, sum-of-squares representation.
 
template<typename T >
__host__ __device__ constexpr T ceildiv (T x, T y)
 
template<typename T >
__host__ __device__ constexpr T roundup (T x, T y)
 
template<typename TA , typename TB >
__host__ __device__ void copy (TA a, TB &b)
 Overloaded copy and precision conversion.
 
__host__ __device__ void copy (cuFloatComplex a, cuDoubleComplex &b)
 Sets b = a, converting from complex-float to complex-double.
 
__host__ __device__ void copy (cuDoubleComplex a, cuFloatComplex &b)
 Sets b = a, converting from complex-double to complex-float.
 
__host__ __device__ void copy (float a, cuFloatComplex &b)
 Sets b = a, converting from float to complex-float.
 
__host__ __device__ void copy (double a, cuDoubleComplex &b)
 Sets b = a, converting from double to complex-double.
 
template<typename x_scalar_t , typename y_scalar_t >
void transpose_batch (bool is_conj, int64_t m, int64_t n, x_scalar_t **dA_array, int64_t lda, y_scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 
template<typename scalar_t , typename scalar_t2 >
void gescale_row_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Kernel implementing row and column scaling.
 
template<typename scalar_t , typename scalar_t2 >
void gescale_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Kernel implementing column scaling.
 
template<typename scalar_t , typename scalar_t2 >
void gescale_row_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Kernel implementing row scaling.
 
template<typename scalar_t >
void transpose_sqr_batch_func (bool is_conj, int n, scalar_t **Aarray, int64_t lda, int batch_count, blas::Queue &queue)
 Device routine handles batches of square matrices.
 
template<typename scalar_t >
void transpose_sqr_func (bool is_conj, int n, scalar_t *A, int64_t lda, blas::Queue &queue)
 Device routine handles single square matrix.
 
template<typename scalar_t , int NX>
void transpose_rect_batch_func (bool is_conj, int m, int n, scalar_t **dAarray, int64_t lda, scalar_t **dATarray, int64_t ldat, int batch_count, blas::Queue &queue)
 Device routine handles batches of rectangular matrices.
 
template<typename scalar_t , int NX>
void transpose_rect_func (bool is_conj, int m, int n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue)
 Device routine handles a single rectangular matrix.
 
template<typename scalar_t , int NX>
void transpose (bool is_conj, int64_t m, int64_t n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue)
 Physically transpose a rectangular matrix out-of-place.
 
template<typename scalar_t , int NX>
void transpose_batch (bool is_conj, int64_t m, int64_t n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 Physically transpose a batch of rectangular matrices out-of-place.
 
template<>
void transpose (bool is_conj, int64_t m, int64_t n, float *dA, int64_t lda, float *dAT, int64_t ldat, blas::Queue &queue)
 
template<>
void transpose (bool is_conj, int64_t m, int64_t n, double *dA, int64_t lda, double *dAT, int64_t ldat, blas::Queue &queue)
 
template<>
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< float > *dA, int64_t lda, std::complex< float > *dAT, int64_t ldat, blas::Queue &queue)
 
template<>
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< double > *dA, int64_t lda, std::complex< double > *dAT, int64_t ldat, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t m, int64_t n, float **dA_array, int64_t lda, float **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t m, int64_t n, double **dA_array, int64_t lda, double **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< float > **dA_array, int64_t lda, std::complex< float > **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 
template<>
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< double > **dA_array, int64_t lda, std::complex< double > **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 

Variables

const int ib = 32
 block size for genorm_one_kernel
 
const int ib1 = 33
 ib + 1 for stride to avoid GPU bank conflicts
 
static const int NB = 32
 block size for transpose_func
 
static const int NY = 8
 y dim of thread block size for transpose_func
 

Detailed Description

GPU device implementations of kernels.

Function Documentation

◆ add_sumsq()

template<typename real_t >
__host__ __device__ void slate::device::add_sumsq ( real_t &  scale,
real_t &  sumsq,
real_t  absx 
)

Adds new value to scaled, sum-of-squares representation.

On exit, scale and sumsq are updated such that: scale^2 sumsq := scale^2 sumsq + (absx)^2

◆ ceildiv()

template<typename T >
__host__ __device__ constexpr T slate::device::ceildiv ( T  x,
T  y 
)
inline constexpr
Returns
ceil( x / y ), for integer type T.

◆ combine_sumsq()

template<typename real_t >
__host__ __device__ void slate::device::combine_sumsq ( real_t &  scale1,
real_t &  sumsq1,
real_t  scale2,
real_t  sumsq2 
)

Adds two scaled, sum-of-squares representations.

On exit, scale1 and sumsq1 are updated such that: scale1^2 sumsq1 := scale1^2 sumsq1 + scale2^2 sumsq2.

◆ conj()

__host__ __device__ double slate::device::conj ( double  x)
inline
Returns
conjugate of complex number x; x for real number.

◆ copy()

template<typename TA , typename TB >
__host__ __device__ void slate::device::copy ( TA  a,
TB &  b 
)
inline

Overloaded copy and precision conversion.

Sets b = a, converting from type TA to type TB.

◆ geadd()

template<typename scalar_t >
void slate::device::geadd ( int64_t  m,
int64_t  n,
scalar_t const &  alpha,
scalar_t *  A,
int64_t  lda,
scalar_t const &  beta,
scalar_t *  B,
int64_t  ldb,
blas::Queue &  queue 
)

Routine for element-wise tile addition.

Sets

\[ B = \alpha A + \beta B. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

Sets

\[ B = \alpha A + \beta B. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

◆ geadd_batch_kernel()

template<typename scalar_t >
__global__ void slate::device::geadd_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t **  Aarray,
int64_t  lda,
scalar_t  beta,
scalar_t **  Barray,
int64_t  ldb 
)

Kernel implementing element-wise tile addition.

◆ geadd_func()

template<typename scalar_t >
__device__ void slate::device::geadd_func ( int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t *  A,
int64_t  lda,
scalar_t  beta,
scalar_t *  B,
int64_t  ldb 
)

Device function implementing element-wise tile addition.

Each thread deals with one row. Launched by geadd_kernel() and geadd_batch_kernel().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]Ais an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of A. lda >= m.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of B. ldb >= m.

Routine for element-wise tile addition.

Sets

\[ B = \alpha A + \beta B. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

Sets

\[ B = \alpha A + \beta B. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

◆ geadd_kernel()

template<typename scalar_t >
__global__ void slate::device::geadd_kernel ( int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t *  A,
int64_t  lda,
scalar_t  beta,
scalar_t *  B,
int64_t  ldb 
)

Kernel implementing element-wise tile addition.

Routine for element-wise tile addition.

Sets

\[ B = \alpha A + \beta B. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

Sets

\[ B = \alpha A + \beta B. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

◆ gecopy()

template<typename src_scalar_t , typename dst_scalar_t >
void slate::device::gecopy ( int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine for element-wise copy and precision conversion, copying A to B.

Sets

\[ Barray[k] = Aarray[k]. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets

\[ Barray[k] = Aarray[k]. \]

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gecopy_kernel()

template<typename src_scalar_t , typename dst_scalar_t >
__global__ void slate::device::gecopy_kernel ( int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb 
)

Kernel implementing copy and precision conversions, copying A to B.

Each thread block deals with one tile. Each thread deals with one row. Launched by gecopy().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[out]BarrayArray of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of each tile in Barray. ldb >= m.

◆ genorm()

template<typename scalar_t >
void slate::device::genorm ( lapack::Norm  norm,
NormScope  scope,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

Parameters
[in]normNorm to compute. See values for description.
[in]scopeScope of the norm.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k, values[k*2 + 1] = sumsq_k, where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., m-1, j = 0, ..., n-1.

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k values[k*2 + 1] = sumsq_k where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ genorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_fro_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 
)

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ genorm_inf_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_inf_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 
)

Sum of absolute values of each row of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

Parameters
[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ genorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_max_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 
)

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * m. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

Parameters
[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block (blockDim.x).
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ genorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_one_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 
)

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block (blockDim.x).
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ gescale()

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale ( int64_t  m,
int64_t  n,
scalar_t2  numer,
scalar_t2  denom,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 
)

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_batch_kernel()

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2  mul,
scalar_t **  Aarray,
int64_t  lda 
)

Kernel implementing element-wise tile scale.

◆ gescale_col_batch_kernel() [1/2]

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda 
)

Kernel implementing column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_col_batch_kernel() [2/2]

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 
)

Kernel implementing column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gescale_func()

template<typename scalar_t , typename scalar_t2 >
__device__ void slate::device::gescale_func ( int64_t  m,
int64_t  n,
scalar_t2  mul,
scalar_t *  A,
int64_t  lda 
)

Device function implementing element-wise tile scale.

Each thread block deals with one tile. gridDim.x == batch_count. Each thread deals with one row. Called by gescale_kernel and gescale_batch_kernel.

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_kernel()

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_kernel ( int64_t  m,
int64_t  n,
scalar_t2  mul,
scalar_t *  A,
int64_t  lda 
)

Kernel implementing element-wise tile scale.

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_row_batch_kernel() [1/2]

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_row_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t **  Aarray,
int64_t  lda 
)

Kernel implementing row scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_row_batch_kernel() [2/2]

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_row_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 
)

Kernel implementing row scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gescale_row_col_batch()

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_row_col_batch ( Equed  equed,
int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine for row and column scaling.

Parameters
[in]equedForm of scaling to do.
  • Equed::Row: sets \( A = diag(R) A \)
  • Equed::Col: sets \( A = A diag(C) \)
  • Equed::Both: sets \( A = diag(R) A diag(C) \) for each R in Rarray, C in Carray, and A in Aarray.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]RarrayVector of length m containing row scaling factors.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gescale_row_col_batch_kernel() [1/2]

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_row_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda 
)

Kernel implementing row and column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_row_col_batch_kernel() [2/2]

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_row_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 
)

Kernel implementing row and column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ geset()

template<typename scalar_t >
void slate::device::geset ( int64_t  m,
int64_t  n,
scalar_t const &  offdiag_value,
scalar_t const &  diag_value,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 
)

Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals.

Parameters
[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueThe value to set outside of the diagonal.
[in]diag_valueThe value to set on the diagonal.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ geset_batch_kernel()

template<typename scalar_t >
__global__ void slate::device::geset_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t **  Aarray,
int64_t  lda 
)

Kernel implementing element-wise tile set.

◆ geset_func()

template<typename scalar_t >
__device__ void slate::device::geset_func ( int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 
)

Device function implementing element-wise tile set.

Each thread block deals with one tile. Each thread deals with one row. Launched by geset_kernel() and geset_batch_kernel().

Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals.

Parameters
[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueThe value to set outside of the diagonal.
[in]diag_valueThe value to set on the diagonal.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ geset_kernel()

template<typename scalar_t >
__global__ void slate::device::geset_kernel ( int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 
)

Kernel implementing element-wise tile set.

Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals.

Parameters
[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueThe value to set outside of the diagonal.
[in]diag_valueThe value to set on the diagonal.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ henorm()

template<typename scalar_t >
void slate::device::henorm ( lapack::Norm  norm,
lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

Parameters
[in]normNorm to compute. See values for description.
[in]uploWhether each Aarray[k] is stored in the upper or lower triangle.
[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k and values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., blockDim.x-1, i = 0, ..., n-1, j = 0, ..., n-1.

Parameters
[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k and values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamdevice to execute in.

◆ henorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::henorm_fro_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 
)

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().

Parameters
[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ henorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::henorm_max_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 
)

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * n. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().

Parameters
[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x).
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ henorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::henorm_one_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 
)

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().

Parameters
[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x).
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ imag()

__host__ __device__ double slate::device::imag ( double  x)
inline
Returns
imaginary component of complex number x; 0 for real number.

◆ max_nan_reduce()

template<typename real_t >
__device__ void slate::device::max_nan_reduce ( int  n,
int  tid,
real_t *  x 
)

Max reduction of n-element array x, leaving total in x[0].

Propagates NaN values consistently. With k threads, can reduce array up to 2*k in size. Assumes number of threads <= 1024, which is the current max number of CUDA threads.

Parameters
[in]nSize of array.
[in]tidThread id.
[in]xArray of dimension n. On exit, x[0] = max(x[0], ..., x[n-1]); the rest of x is overwritten.

◆ real()

__host__ __device__ double slate::device::real ( double  x)
inline
Returns
real component of complex number x; x for real number.

◆ roundup()

template<typename T >
__host__ __device__ constexpr T slate::device::roundup ( T  x,
T  y 
)
inline constexpr
Returns
ceil( x / y )*y, i.e., x rounded up to next multiple of y.

◆ sqr()

template<typename scalar_t >
__host__ __device__ scalar_t slate::device::sqr ( scalar_t  x)
inline

Square of number.

Returns
x^2

◆ sum_reduce()

template<typename real_t >
__device__ void slate::device::sum_reduce ( int  n,
int  tid,
real_t *  x 
)

Sum reduction of n-element array x, leaving total in x[0].

With k threads, can reduce array up to 2*k in size. Assumes number of threads <= 1024 (which is current max number of CUDA threads).

Parameters
[in]nSize of array.
[in]tidThread id.
[in]xArray of dimension n. On exit, x[0] = sum(x[0], ..., x[n-1]); rest of x is overwritten.

◆ synorm()

template<typename scalar_t >
void slate::device::synorm ( lapack::Norm  norm,
lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

Parameters
[in]normNorm to compute. See values for description.
[in]uploWhether each Aarray[k] is stored in the upper or lower triangle.
[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k and values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., blockDim.x-1, i = 0, ..., n-1, j = 0, ..., n-1.

Parameters
[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k values[k*2 + 1] = sumsq_k where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamdevice to execute in.

◆ synorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_fro_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 
)

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().

Parameters
[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ synorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_max_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 
)

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * n. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().

Parameters
[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x).
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ synorm_offdiag_one_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_offdiag_one_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 
)

Sum of absolute values of each row and each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Kernel assumes non-trivial tiles (m, n >= 1). Launched by synormOffdiag().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k), and tiles_sums[k*ldv + i + n] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ synorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_one_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 
)

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().

Parameters
[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x).
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ synormOffdiag()

template<typename scalar_t >
void slate::device::synormOffdiag ( lapack::Norm  norm,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

Used for full, off-diagonal tiles within a symmetric matrix, where element Aij contributes to both column i and j.

Parameters
[in]normNorm to compute. See values for description.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k and values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., blockDim.x-1, i = 0, ..., n-1, j = 0, ..., n-1.

Parameters
[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k and values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamGPU device to execute in.

◆ transpose() [1/3]

template<typename scalar_t >
void slate::device::transpose ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t *  dA,
int64_t  lda,
scalar_t *  dAT,
int64_t  ldat,
blas::Queue &  queue 
)

Physically transpose a rectangular matrix out-of-place.

Parameters
[in]mNumber of rows of dA (columns of dAT). m >= 0.
[in]nNumber of columns of dA (rows of dAT). n >= 0.
[in]dAA rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of dA. lda >= m.
[out]dATA rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, dAT is the transpose of dA.
[in]ldatLeading dimension of dAT. ldat >= n.
[in]queueBLAS++ queue to execute in.

◆ transpose() [2/3]

template<typename scalar_t , int NX>
void slate::device::transpose ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t *  dA,
int64_t  lda,
scalar_t *  dAT,
int64_t  ldat,
blas::Queue &  queue 
)

Physically transpose a rectangular matrix out-of-place.

Parameters
[in]mNumber of rows of dA (columns of dAT). m >= 0.
[in]nNumber of columns of dA (rows of dAT). n >= 0.
[in]dAA rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of dA. lda >= m.
[out]dATA rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, dAT is the transpose of dA.
[in]ldatLeading dimension of dAT. ldat >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose() [3/3]

template<typename scalar_t >
void slate::device::transpose ( bool  is_conj,
int64_t  n,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 
)

Physically transpose a square matrix in place.

Parameters
[in]nNumber of rows and columns of each tile. n >= 0.
[in,out]AA square n-by-n matrix stored in an lda-by-n array in GPU memory. On output, A is transposed.
[in]ldaLeading dimension of A. lda >= n.
[in]queueBLAS++ queue to execute in.

◆ transpose_batch() [1/3]

template<typename scalar_t >
void slate::device::transpose_batch ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t **  dA_array,
int64_t  lda,
scalar_t **  dAT_array,
int64_t  ldat,
int64_t  batch_count,
blas::Queue &  queue 
)

Physically transpose a batch of rectangular matrices out-of-place.

Parameters
[in]mNumber of rows of each dA_array[k] tile (columns of each dAT_array[k] tile). m >= 0.
[in]nNumber of columns of each dA_array[k] tile (rows of each dAT_array[k] tile). n >= 0.
[in]dA_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each dA_array[k] tile. lda >= m.
[out]dAT_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dAT_array[k] is a rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, each dAT_array[k] is the transpose of dA_array[k].
[in]ldatLeading dimension of each dAT_array[k] tile. ldat >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose_batch() [2/3]

template<typename scalar_t , int NX>
void slate::device::transpose_batch ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t **  dA_array,
int64_t  lda,
scalar_t **  dAT_array,
int64_t  ldat,
int64_t  batch_count,
blas::Queue &  queue 
)

Physically transpose a batch of rectangular matrices out-of-place.

Parameters
[in]mNumber of rows of each dA_array[k] tile (columns of each dAT_array[k] tile). m >= 0.
[in]nNumber of columns of each dA_array[k] tile (rows of each dAT_array[k] tile). n >= 0.
[in]dA_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each dA_array[k] tile. lda >= m.
[out]dAT_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dAT_array[k] is a rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, each dAT_array[k] is the transpose of dA_array[k].
[in]ldatLeading dimension of each dAT_array[k] tile. ldat >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose_batch() [3/3]

template<typename scalar_t >
void slate::device::transpose_batch ( bool  is_conj,
int64_t  n,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 
)

Physically transpose a batch of square matrices in place.

Parameters
[in]nNumber of rows and columns of each tile. n >= 0.
[in,out]AarrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each Aarray[k] is a square n-by-n matrix stored in an lda-by-n array in GPU memory. On output, each Aarray[k] is transposed.
[in]ldaLeading dimension of each tile. lda >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose_func() [1/2]

template<typename scalar_t , int NX>
__device__ void slate::device::transpose_func ( bool  is_conj,
int  m,
int  n,
const scalar_t *  A,
int64_t  lda,
scalar_t *  AT,
int64_t  ldat 
)

Tiles the M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB.

Uses NX-by-NY threads, where NB/NX, NB/NY, and NX/NY all divide evenly. Subtiles each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB. For each subtile:
  1. load the NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY;
  2. save the NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY;
  3. advance A += NX and AT += NX*ldat.

E.g., with NB=32, NX=32, NY=8 ([sdc] precisions):
  load the 32x32 subtile as 4 blocks of 32x8 columns: (A11 A12 A13 A14)
  save the 32x32 subtile as 1*4 blocks of 32x8 columns: (AT11 AT12 AT13 AT14)

E.g., with NB=32, NX=16, NY=8 (z precision):
  load the 16x32 subtile as 4 blocks of 16x8 columns: (A11 A12 A13 A14)
  save the 32x16 subtile as 2*2 blocks of 16x8 columns:
    (AT11 AT12)
    (AT21 AT22)

◆ transpose_func() [2/2]

template<typename scalar_t >
__device__ void slate::device::transpose_func ( bool  is_conj,
int  n,
scalar_t *  A,
int64_t  lda 
)

Device routine handles one matrix.

Thread block grid: x = batch index (ignored here; see batch kernel), y = block row index, z = block col index. Each thread block is ib-by-ib threads and does one ib-by-ib block of an n-by-n matrix.

Let nt = ceildiv( n, ib ) be the number of blocks for one n-by-n matrix. An even number of blocks uses an (nt + 1) by (nt/2) grid. Example: for nt = 4 blocks, y by z = 5 by 2 grid:

    [ A00  A01 ]
    [----. A11 ]                  [ A10  .  |  .   .  ]
    [ A10 '----]                  [ A20 A21 |  .   .  ]
    [ A20  A21 ] covers matrix as [ A30 A31 | A00  .  ]
    [ A30  A31 ]                  [ A40 A41 | A01 A11 ]
    [ A40  A41 ]

An odd number of blocks uses an (nt) by (nt + 1)/2 grid. Example: for nt = 5 blocks, y by z = 5 by 3 grid:

    [ A00 | A01  A02 ]
    [     '----.     ]                  [ A00  .   .  |  .   .  ]
    [ A10  A11 | A12 ]                  [ A10 A11  .  |  .   .  ]
    [          '-----] covers matrix as [ A20 A21 A22 |  .   .  ]
    [ A20  A21   A22 ]                  [ A30 A31 A32 | A01  .  ]
    [ A30  A31   A32 ]                  [ A40 A41 A42 | A02 A12 ]
    [ A40  A41   A42 ]

◆ transpose_rect_batch_func()

template<typename scalar_t , int NX>
void slate::device::transpose_rect_batch_func ( bool  is_conj,
int  m,
int  n,
scalar_t **  dAarray,
int64_t  lda,
scalar_t **  dATarray,
int64_t  ldat,
int  batch_count,
blas::Queue &  queue 
)

Device routine handles batches of rectangular matrices.

The routine loads blocks of data into small NX x NB local storage and then writes the blocks back, transposed, into the correct location.

◆ transpose_rect_func()

template<typename scalar_t , int NX>
void slate::device::transpose_rect_func ( bool  is_conj,
int  m,
int  n,
scalar_t *  dA,
int64_t  lda,
scalar_t *  dAT,
int64_t  ldat,
blas::Queue &  queue 
)

Device routine handles a single rectangular matrix.

The routine loads blocks of data into small NX x NB local storage and then writes the blocks back, transposed, into the correct location.

◆ transpose_sqr_batch_func()

template<typename scalar_t >
void slate::device::transpose_sqr_batch_func ( bool  is_conj,
int  n,
scalar_t **  Aarray,
int64_t  lda,
int  batch_count,
blas::Queue &  queue 
)

Device routine handles batches of square matrices.

The routine loads blocks of data into small ib x ib local storage and then writes the blocks back, transposed, into the correct location.

◆ transpose_sqr_func()

template<typename scalar_t >
void slate::device::transpose_sqr_func ( bool  is_conj,
int  n,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 
)

Device routine handles a single square matrix.

The routine loads blocks of data into small ib x ib local storage and then writes the blocks back, transposed, into the correct location.

◆ trnorm()

template<typename scalar_t >
void slate::device::trnorm ( lapack::Norm  norm,
lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine that computes a partial norm for each trapezoidal tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

todo: rename to tznorm for consistency with other tz routines.

Parameters
[in]normNorm to compute. See values for description.
[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]diagWhether or not each Aarray[k] has unit diagonal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k and values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., blockDim.x-1, i = 0, ..., m-1, j = 0, ..., n-1.

Parameters
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k and values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamdevice to execute in.

◆ trnorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_fro_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 
)

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ trnorm_inf_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_inf_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 
)

Sum of absolute values of each row of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

Parameters
[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ trnorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_max_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 
)

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * m. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

Parameters
[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block (blockDim.x).
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ trnorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_one_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 
)

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block (blockDim.x).
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ tzadd()

template<typename scalar_t >
void slate::device::tzadd ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t const &  alpha,
scalar_t **  Aarray,
int64_t  lda,
scalar_t const &  beta,
scalar_t **  Barray,
int64_t  ldb,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine for element-wise trapezoidal tile addition.

Sets upper or lower part of

\[ Barray[k] = \alpha Aarray[k] + \beta Barray[k]. \]

Parameters
[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets upper or lower part of

\[ Barray[k] = \alpha Aarray[k] + \beta Barray[k]. \]

Parameters
[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ tzadd_kernel()

template<typename scalar_t >
__global__ void slate::device::tzadd_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t **  Aarray,
int64_t  lda,
scalar_t  beta,
scalar_t **  Barray,
int64_t  ldb 
)

Kernel implementing element-wise tile addition.

Each thread block deals with one tile. Each thread deals with one row. Launched by tzadd().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[in,out]BarrayArray of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of each tile in Barray. ldb >= m.

◆ tzcopy()

template<typename src_scalar_t , typename dst_scalar_t >
void slate::device::tzcopy ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb,
int64_t  batch_count,
blas::Queue &  queue 
)

Batched routine for element-wise trapezoidal copy and precision conversion, copying A to B.

Batched routine for element-wise copy and precision conversion.

Sets upper or lower part of

\[ Barray[k] = Aarray[k]. \]

Parameters
[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ tzcopy_kernel()

template<typename src_scalar_t , typename dst_scalar_t >
__global__ void slate::device::tzcopy_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb 
)

Kernel implementing copy and precision conversions, copying A to B.

Each thread block deals with one tile. Each thread deals with one row. Launched by tzcopy().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[out]BarrayArray of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of each tile in Barray. ldb >= m.

◆ tzscale_kernel()

template<typename scalar_t >
__global__ void slate::device::tzscale_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
blas::real_type< scalar_t >  numer,
blas::real_type< scalar_t >  denom,
scalar_t **  Aarray,
int64_t  lda 
)

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by tzscale().

Parameters
[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ tzset()

template<typename scalar_t >
void slate::device::tzset ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t const &  offdiag_value,
scalar_t const &  diag_value,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 
)

Element-wise trapezoidal tile set.

Sets upper or lower part of A to diag_value on the diagonal and offdiag_value on the off-diagonals.

Parameters
[in]uploWhether A is upper or lower trapezoidal.
[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueConstant to set offdiagonal entries to.
[in]diag_valueConstant to set diagonal entries to.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ tzset_batch_kernel()

template<typename scalar_t >
__global__ void slate::device::tzset_batch_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t **  Aarray,
int64_t  lda 
)

Kernel implementing element-wise tile set.

◆ tzset_func()

template<typename scalar_t >
__device__ void slate::device::tzset_func ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 
)

Device function implementing element-wise tile set.

Each thread block deals with one tile. gridDim.x == batch_count. Each thread deals with one row. Called by tzset_kernel and tzset_batch_kernel.

◆ tzset_kernel()

template<typename scalar_t >
__global__ void slate::device::tzset_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 
)

Kernel implementing element-wise tile set.

Variable Documentation

◆ ib

static const int slate::device::ib = 32

block size for genorm_one_kernel

Internal blocking: a 16 x 16 thread block = 256 threads; a 32 x 32 thread block = 1024 threads.