SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
No Matches
slate::device Namespace Reference

GPU device implementations of kernels. More...


struct  nx_traits
 Look up NX based on data type. More...


template<typename src_scalar_t , typename dst_scalar_t >
void gecopy (int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 Batched routine for element-wise copy and precision conversion, copying A to B.
template<typename src_scalar_t , typename dst_scalar_t >
void tzcopy (Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
void geadd (int64_t m, int64_t n, scalar_t const &alpha, scalar_t *A, int64_t lda, scalar_t const &beta, scalar_t *B, int64_t ldb, blas::Queue &queue)
 Routine for element-wise tile addition.
template<typename scalar_t >
void tzadd (Uplo uplo, int64_t m, int64_t n, scalar_t const &alpha, scalar_t **Aarray, int64_t lda, scalar_t const &beta, scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t , typename scalar_t2 >
void gescale (int64_t m, int64_t n, scalar_t2 numer, scalar_t2 denom, scalar_t *A, int64_t lda, blas::Queue &queue)
 Routine for element-wise tile scale.
template<typename scalar_t , typename scalar_t2 >
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Batched routine for row and column scaling.
template<typename scalar_t >
void geset (int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue)
 Element-wise sets the m-by-n matrix A to diag_value on the diagonal and offdiag_value on the off-diagonals.
template<typename scalar_t >
void tzset (Uplo uplo, int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue)
template<typename scalar_t >
void genorm (Norm norm, NormScope scope, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
void henorm (Norm norm, Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
void synorm (Norm norm, Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
void synormOffdiag (Norm norm, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
void trnorm (Norm norm, Uplo uplo, Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
void transpose (bool is_conj, int64_t n, scalar_t *A, int64_t lda, blas::Queue &queue)
 Physically transpose a square matrix in place.
template<typename scalar_t >
void transpose_batch (bool is_conj, int64_t n, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Physically transpose a batch of square matrices in place.
template<typename scalar_t >
void transpose (bool is_conj, int64_t m, int64_t n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue)
 Physically transpose a rectangular matrix out-of-place.
template<typename scalar_t >
void transpose_batch (bool is_conj, int64_t m, int64_t n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 Physically transpose a batch of rectangular matrices out-of-place.
template<typename scalar_t >
__device__ void geadd_func (int64_t m, int64_t n, scalar_t alpha, scalar_t *A, int64_t lda, scalar_t beta, scalar_t *B, int64_t ldb)
 Device function implementing element-wise tile addition.
template<typename scalar_t >
__global__ void geadd_kernel (int64_t m, int64_t n, scalar_t alpha, scalar_t *A, int64_t lda, scalar_t beta, scalar_t *B, int64_t ldb)
 Kernel implementing element-wise tile addition.
template<typename scalar_t >
__global__ void geadd_batch_kernel (int64_t m, int64_t n, scalar_t alpha, scalar_t **Aarray, int64_t lda, scalar_t beta, scalar_t **Barray, int64_t ldb)
 Kernel implementing batched element-wise tile addition.
template void geadd (int64_t m, int64_t n, float const &alpha, float *Aarray, int64_t lda, float const &beta, float *Barray, int64_t ldb, blas::Queue &queue)
template void geadd (int64_t m, int64_t n, double const &alpha, double *Aarray, int64_t lda, double const &beta, double *Barray, int64_t ldb, blas::Queue &queue)
void geadd (int64_t m, int64_t n, std::complex< float > const &alpha, std::complex< float > *Aarray, int64_t lda, std::complex< float > const &beta, std::complex< float > *Barray, int64_t ldb, blas::Queue &queue)
void geadd (int64_t m, int64_t n, std::complex< double > const &alpha, std::complex< double > *Aarray, int64_t lda, std::complex< double > const &beta, std::complex< double > *Barray, int64_t ldb, blas::Queue &queue)
template<typename src_scalar_t , typename dst_scalar_t >
__global__ void gecopy_kernel (int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb)
 Kernel implementing copy and precision conversions, copying A to B.
template void gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void gecopy (int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void gecopy (int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void gecopy (int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void gecopy (int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
__global__ void genorm_max_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void genorm_one_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void genorm_inf_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each row of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void genorm_fro_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
template<typename scalar_t >
__global__ void ge_col_norms_max_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *col_max, int64_t ldv)
template<typename scalar_t >
void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
template void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t , typename scalar_t2 >
__device__ void gescale_func (int64_t m, int64_t n, scalar_t2 mul, scalar_t *A, int64_t lda)
 Device function implementing element-wise tile scale.
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_kernel (int64_t m, int64_t n, scalar_t2 mul, scalar_t *A, int64_t lda)
 Kernel implementing element-wise tile scale.
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_batch_kernel (int64_t m, int64_t n, scalar_t2 mul, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile scale.
template void gescale (int64_t m, int64_t n, float numer, float denom, float *A, int64_t lda, blas::Queue &queue)
template void gescale (int64_t m, int64_t n, double numer, double denom, double *A, int64_t lda, blas::Queue &queue)
void gescale (int64_t m, int64_t n, float numer, float denom, std::complex< float > *A, int64_t lda, blas::Queue &queue)
void gescale (int64_t m, int64_t n, std::complex< float > numer, std::complex< float > denom, std::complex< float > *A, int64_t lda, blas::Queue &queue)
void gescale (int64_t m, int64_t n, double numer, double denom, std::complex< double > *A, int64_t lda, blas::Queue &queue)
void gescale (int64_t m, int64_t n, std::complex< double > numer, std::complex< double > denom, std::complex< double > *A, int64_t lda, blas::Queue &queue)
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_row_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda)
 Kernel implementing row and column scaling.
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda)
 Kernel implementing column scaling.
template<typename scalar_t , typename scalar_t2 >
__global__ void gescale_row_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t **Aarray, int64_t lda)
 Kernel implementing row scaling.
template void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, float const *const *Rarray, float const *const *Carray, float **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
template void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, double const *const *Rarray, double const *const *Carray, double **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, float const *const *Rarray, float const *const *Carray, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, double const *const *Rarray, double const *const *Carray, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, std::complex< float > const *const *Rarray, std::complex< float > const *const *Carray, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
void gescale_row_col_batch (Equed equed, int64_t m, int64_t n, std::complex< double > const *const *Rarray, std::complex< double > const *const *Carray, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
__device__ void geset_func (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Device function implementing element-wise tile set.
template<typename scalar_t >
__global__ void geset_kernel (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Kernel implementing element-wise tile set.
template<typename scalar_t >
__global__ void geset_batch_kernel (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile set.
template void geset (int64_t m, int64_t n, float const &offdiag_value, float const &diag_value, float *A, int64_t lda, blas::Queue &queue)
template void geset (int64_t m, int64_t n, double const &offdiag_value, double const &diag_value, double *A, int64_t lda, blas::Queue &queue)
void geset (int64_t m, int64_t n, std::complex< float > const &offdiag_value, std::complex< float > const &diag_value, std::complex< float > *A, int64_t lda, blas::Queue &queue)
void geset (int64_t m, int64_t n, std::complex< double > const &offdiag_value, std::complex< double > const &diag_value, std::complex< double > *A, int64_t lda, blas::Queue &queue)
template<typename scalar_t >
__global__ void henorm_max_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void henorm_one_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void henorm_fro_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
template<typename scalar_t >
void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
template void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
__global__ void synorm_max_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void synorm_one_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void synorm_fro_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
template<typename scalar_t >
void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
template<typename scalar_t >
__global__ void synorm_offdiag_one_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each row and each column of elements, for each tile in Aarray.
template<typename scalar_t >
void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each tile.
template void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
__device__ void transpose_func (bool is_conj, int n, scalar_t *A, int64_t lda)
 Device routine handles one matrix.
template<typename scalar_t , int NX>
__device__ void transpose_func (bool is_conj, int m, int n, const scalar_t *A, int64_t lda, scalar_t *AT, int64_t ldat)
 tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB.
template<typename scalar_t >
__global__ void transpose_kernel (bool is_conj, int n, scalar_t *A, int64_t lda)
 in-place transpose of a square buffer
template<typename scalar_t >
__global__ void transpose_batch_kernel (bool is_conj, int n, scalar_t **Aarray, int64_t lda)
 in-place transpose of array of square buffers
template<typename scalar_t , int NX>
__global__ void transpose_kernel (bool is_conj, int m, int n, const scalar_t *A, int64_t lda, scalar_t *AT, int64_t ldat)
 Out-of-place transpose of a rectangular buffer; transposes A onto AT.
template<typename scalar_t , int NX>
__global__ void transpose_batch_kernel (bool is_conj, int m, int n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat)
 Out-of-place transpose of an array of rectangular buffers; transposes dA_array onto dAT_array.
template void transpose (bool is_conj, int64_t n, float *A, int64_t lda, blas::Queue &queue)
template void transpose (bool is_conj, int64_t n, double *A, int64_t lda, blas::Queue &queue)
template void transpose (bool is_conj, int64_t m, int64_t n, float *A, int64_t lda, float *B, int64_t ldb, blas::Queue &queue)
template void transpose (bool is_conj, int64_t m, int64_t n, double *A, int64_t lda, double *B, int64_t ldb, blas::Queue &queue)
void transpose (bool is_conj, int64_t n, std::complex< float > *A, int64_t lda, blas::Queue &queue)
void transpose (bool is_conj, int64_t n, std::complex< double > *A, int64_t lda, blas::Queue &queue)
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< float > *A, int64_t lda, std::complex< float > *B, int64_t ldb, blas::Queue &queue)
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< double > *A, int64_t lda, std::complex< double > *B, int64_t ldb, blas::Queue &queue)
template void transpose_batch (bool is_conj, int64_t n, float **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
template void transpose_batch (bool is_conj, int64_t n, double **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
template void transpose_batch (bool is_conj, int64_t m, int64_t n, float **Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void transpose_batch (bool is_conj, int64_t m, int64_t n, double **Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t n, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t n, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< float > **Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< double > **Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
__global__ void trnorm_max_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima)
 Finds the largest absolute value of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void trnorm_one_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each column of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void trnorm_inf_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv)
 Sum of absolute values of each row of elements, for each tile in Aarray.
template<typename scalar_t >
__global__ void trnorm_fro_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values)
 Sum of squares, in scaled representation, for each tile in Aarray.
template<typename scalar_t >
void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
 Batched routine that computes a partial norm for each trapezoidal tile.
template void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
void trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
__global__ void tzadd_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t alpha, scalar_t **Aarray, int64_t lda, scalar_t beta, scalar_t **Barray, int64_t ldb)
 Kernel implementing element-wise tile addition.
template<typename scalar_t >
void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t const &alpha, scalar_t **Aarray, int64_t lda, scalar_t const &beta, scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 Batched routine for element-wise trapezoidal tile addition.
template void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, float const &alpha, float **Aarray, int64_t lda, float const &beta, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, double const &alpha, double **Aarray, int64_t lda, double const &beta, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const &alpha, std::complex< float > **Aarray, int64_t lda, std::complex< float > const &beta, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void tzadd (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const &alpha, std::complex< double > **Aarray, int64_t lda, std::complex< double > const &beta, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template<typename src_scalar_t , typename dst_scalar_t >
__global__ void tzcopy_kernel (lapack::Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb)
 Kernel implementing copy and precision conversions, copying A to B.
template<typename src_scalar_t , typename dst_scalar_t >
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
 Batched routine for element-wise trapezoidal copy and precision conversion, copying A to B.
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
void tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t >
__global__ void tzscale_kernel (lapack::Uplo uplo, int64_t m, int64_t n, blas::real_type< scalar_t > numer, blas::real_type< scalar_t > denom, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile scale.
template<typename scalar_t >
__device__ void tzset_func (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Device function implementing element-wise tile set.
template<typename scalar_t >
__global__ void tzset_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda)
 Kernel implementing element-wise tile set.
template<typename scalar_t >
__global__ void tzset_batch_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t **Aarray, int64_t lda)
 Kernel implementing element-wise tile set.
template<typename scalar_t >
void tzset (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue)
 Element-wise trapezoidal tile set.
template void tzset (lapack::Uplo uplo, int64_t m, int64_t n, float const &offdiag_value, float const &diag_value, float *A, int64_t lda, blas::Queue &queue)
template void tzset (lapack::Uplo uplo, int64_t m, int64_t n, double const &offdiag_value, double const &diag_value, double *A, int64_t lda, blas::Queue &queue)
void tzset (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const &offdiag_value, std::complex< float > const &diag_value, std::complex< float > *A, int64_t lda, blas::Queue &queue)
void tzset (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const &offdiag_value, std::complex< double > const &diag_value, std::complex< double > *A, int64_t lda, blas::Queue &queue)
template<typename real_t >
__host__ __device__ real_t max_nan (real_t x, real_t y)
 Max that propagates NaN consistently: max_nan( 1, nan ) = nan and max_nan( nan, 1 ) = nan.
template<typename real_t >
__device__ void max_nan_reduce (int n, int tid, real_t *x)
 Max reduction of n-element array x, leaving total in x[0].
template<typename real_t >
__device__ void sum_reduce (int n, int tid, real_t *x)
 Sum reduction of n-element array x, leaving total in x[0].
__host__ __device__ double real (rocblas_double_complex x)
__host__ __device__ float real (rocblas_float_complex x)
__host__ __device__ double imag (rocblas_double_complex x)
__host__ __device__ float imag (rocblas_float_complex x)
__host__ __device__ rocblas_double_complex conj (rocblas_double_complex x)
__host__ __device__ rocblas_float_complex conj (rocblas_float_complex x)
__host__ __device__ double real (double x)
__host__ __device__ float real (float x)
__host__ __device__ double imag (double x)
__host__ __device__ float imag (float x)
__host__ __device__ double conj (double x)
__host__ __device__ float conj (float x)
__host__ __device__ float abs (float x)
 Overloaded versions of absolute value on device.
__host__ __device__ double abs (double x)
__host__ __device__ float abs (cuFloatComplex x)
__host__ __device__ double abs (cuDoubleComplex x)
template<typename scalar_t >
__host__ __device__ scalar_t sqr (scalar_t x)
 Square of number.
template<typename real_t >
__host__ __device__ void combine_sumsq (real_t &scale1, real_t &sumsq1, real_t scale2, real_t sumsq2)
 Adds two scaled, sum-of-squares representations.
template<typename real_t >
__host__ __device__ void add_sumsq (real_t &scale, real_t &sumsq, real_t absx)
 Adds new value to scaled, sum-of-squares representation.
template<typename T >
__host__ __device__ constexpr T ceildiv (T x, T y)
template<typename T >
__host__ __device__ constexpr T roundup (T x, T y)
template<typename TA , typename TB >
__host__ __device__ void copy (TA a, TB &b)
 Overloaded copy and precision conversion.
__host__ __device__ void copy (cuFloatComplex a, cuDoubleComplex &b)
 Sets b = a, converting from complex-float to complex-double.
__host__ __device__ void copy (cuDoubleComplex a, cuFloatComplex &b)
 Sets b = a, converting from complex-double to complex-float.
__host__ __device__ void copy (float a, cuFloatComplex &b)
 Sets b = a, converting from float to complex-float.
__host__ __device__ void copy (double a, cuDoubleComplex &b)
 Sets b = a, converting from double to complex-double.
template<typename x_scalar_t , typename y_scalar_t >
void transpose_batch (bool is_conj, int64_t m, int64_t n, x_scalar_t **dA_array, int64_t lda, y_scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
template<typename scalar_t , typename scalar_t2 >
void gescale_row_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Kernel implementing row and column scaling.
template<typename scalar_t , typename scalar_t2 >
void gescale_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Kernel implementing column scaling.
template<typename scalar_t , typename scalar_t2 >
void gescale_row_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue)
 Kernel implementing row scaling.
template<typename scalar_t >
void transpose_sqr_batch_func (bool is_conj, int n, scalar_t **Aarray, int64_t lda, int batch_count, blas::Queue &queue)
 Device routine handles batches of square matrices.
template<typename scalar_t >
void transpose_sqr_func (bool is_conj, int n, scalar_t *A, int64_t lda, blas::Queue &queue)
 Device routine handles single square matrix.
template<typename scalar_t , int NX>
void transpose_rect_batch_func (bool is_conj, int m, int n, scalar_t **dAarray, int64_t lda, scalar_t **dATarray, int64_t ldat, int batch_count, blas::Queue &queue)
 Device routine handles batches of rectangular matrices.
template<typename scalar_t , int NX>
void transpose_rect_func (bool is_conj, int m, int n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue)
 Device routine handles a single rectangular matrix.
template<typename scalar_t , int NX>
void transpose (bool is_conj, int64_t m, int64_t n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue)
 Physically transpose a rectangular matrix out-of-place.
template<typename scalar_t , int NX>
void transpose_batch (bool is_conj, int64_t m, int64_t n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
 Physically transpose a batch of rectangular matrices out-of-place.
void transpose (bool is_conj, int64_t m, int64_t n, float *dA, int64_t lda, float *dAT, int64_t ldat, blas::Queue &queue)
void transpose (bool is_conj, int64_t m, int64_t n, double *dA, int64_t lda, double *dAT, int64_t ldat, blas::Queue &queue)
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< float > *dA, int64_t lda, std::complex< float > *dAT, int64_t ldat, blas::Queue &queue)
void transpose (bool is_conj, int64_t m, int64_t n, std::complex< double > *dA, int64_t lda, std::complex< double > *dAT, int64_t ldat, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t m, int64_t n, float **dA_array, int64_t lda, float **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t m, int64_t n, double **dA_array, int64_t lda, double **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< float > **dA_array, int64_t lda, std::complex< float > **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)
void transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< double > **dA_array, int64_t lda, std::complex< double > **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue)


const int ib = 32
 block size for genorm_one_kernel
const int ib1 = 33
 ib + 1 for stride to avoid GPU bank conflicts
static const int NB = 32
 block size for transpose_func
static const int NY = 8
 y dim of thread block size for transpose_func

Detailed Description

GPU device implementations of kernels.

Function Documentation

◆ add_sumsq()

template<typename real_t >
__host__ __device__ void slate::device::add_sumsq ( real_t &  scale,
real_t &  sumsq,
real_t  absx 

Adds new value to scaled, sum-of-squares representation.

On exit, scale and sumsq are updated such that: scale^2 sumsq := scale^2 sumsq + (absx)^2

◆ ceildiv()

template<typename T >
__host__ __device__ constexpr T slate::device::ceildiv ( T  x,
T  y 

ceil( x / y ), for integer type T.

◆ combine_sumsq()

template<typename real_t >
__host__ __device__ void slate::device::combine_sumsq ( real_t &  scale1,
real_t &  sumsq1,
real_t  scale2,
real_t  sumsq2 

Adds two scaled, sum-of-squares representations.

On exit, scale1 and sumsq1 are updated such that: scale1^2 sumsq1 := scale1^2 sumsq1 + scale2^2 sumsq2.

◆ conj()

__host__ __device__ double slate::device::conj ( double  x)
conjugate of complex number x; x for real number.

◆ copy()

template<typename TA , typename TB >
__host__ __device__ void slate::device::copy ( TA  a,
TB &  b 

Overloaded copy and precision conversion.

Sets b = a, converting from type TA to type TB.

◆ geadd()

template<typename scalar_t >
void slate::device::geadd ( int64_t  m,
int64_t  n,
scalar_t const &  alpha,
scalar_t *  A,
int64_t  lda,
scalar_t const &  beta,
scalar_t *  B,
int64_t  ldb,
blas::Queue &  queue 

Routine for element-wise tile addition.


\[ B = \alpha A + \beta B. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.


\[ B = \alpha A + \beta B. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

◆ geadd_batch_kernel()

template<typename scalar_t >
__global__ void slate::device::geadd_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t **  Aarray,
int64_t  lda,
scalar_t  beta,
scalar_t **  Barray,
int64_t  ldb 

Kernel implementing element-wise tile addition.

◆ geadd_func()

template<typename scalar_t >
__device__ void slate::device::geadd_func ( int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t *  A,
int64_t  lda,
scalar_t  beta,
scalar_t *  B,
int64_t  ldb 

Kernel implementing element-wise tile addition.

Each thread deals with one row. Launched by geadd_kernel() and geadd_batch_kernel().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]Aarrayis an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of each tile in Barray. ldb >= m.

Routine for element-wise tile addition.


\[ B = \alpha A + \beta B. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.


\[ B = \alpha A + \beta B. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

◆ geadd_kernel()

template<typename scalar_t >
__global__ void slate::device::geadd_kernel ( int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t *  A,
int64_t  lda,
scalar_t  beta,
scalar_t *  B,
int64_t  ldb 

Kernel implementing element-wise tile addition.

Routine for element-wise tile addition.


\[ B = \alpha A + \beta B. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.


\[ B = \alpha A + \beta B. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]Ais an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]Bis an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]queueBLAS++ queue to execute in.

◆ gecopy()

template<typename src_scalar_t , typename dst_scalar_t >
void slate::device::gecopy ( int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine for element-wise copy and precision conversion, copying A to B.


\[ Barray[k] = Aarray[k]. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.


\[ Barray[k] = Aarray[k]. \]

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gecopy_kernel()

template<typename src_scalar_t , typename dst_scalar_t >
__global__ void slate::device::gecopy_kernel ( int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb 

Kernel implementing copy and precision conversions, copying A to B.

Each thread block deals with one tile. Each thread deals with one row. Launched by gecopy().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[out]BarrayArray of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of each tile in Barray. ldb >= m.

◆ genorm()

template<typename scalar_t >
void slate::device::genorm ( lapack::Norm  norm,
NormScope  scope,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

[in]normNorm to compute. See values for description.
[in]scopeScope of the norm.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k values[k*2 + 1] = sumsq_k where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., m-1, j = 0, ..., n-1.

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k values[k*2 + 1] = sumsq_k where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamdevice to execute in.

◆ genorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_fro_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block, hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ genorm_inf_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_inf_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 

Sum of absolute values of each row of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block, hence,
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ genorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_max_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * m. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block (blockDim.x), hence,
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ genorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::genorm_one_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block (blockDim.x), hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ gescale()

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale ( int64_t  m,
int64_t  n,
scalar_t2  numer,
scalar_t2  denom,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_batch_kernel()

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2  mul,
scalar_t **  Aarray,
int64_t  lda 

Kernel implementing element-wise tile scale.

◆ gescale_col_batch_kernel() [1/2]

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda 

Kernel implementing column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_col_batch_kernel() [2/2]

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 

Kernel implementing column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gescale_func()

template<typename scalar_t , typename scalar_t2 >
__device__ void slate::device::gescale_func ( int64_t  m,
int64_t  n,
scalar_t2  mul,
scalar_t *  A,
int64_t  lda 

Device function implementing element-wise tile scale.

Each thread block deals with one tile. gridDim.x == batch_count. Each thread deals with one row. Called by gescale_kernel and gescale_batch_kernel.

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_kernel()

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_kernel ( int64_t  m,
int64_t  n,
scalar_t2  mul,
scalar_t *  A,
int64_t  lda 

Kernel implementing element-wise tile scale.

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_row_batch_kernel() [1/2]

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_row_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t **  Aarray,
int64_t  lda 

Kernel implementing row scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_row_batch_kernel() [2/2]

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_row_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 

Kernel implementing row scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gescale_row_col_batch()

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_row_col_batch ( Equed  equed,
int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine for row and column scaling.

[in]equedForm of scaling to do.
  • Equed::Row: sets \( A = diag(R) A \)
  • Equed::Col: sets \( A = A diag(C) \)
  • Equed::Both: sets \( A = diag(R) A diag(C) \) for each R in Rarray, C in Carray, and A in Aarray.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]RarrayVector of length m containing row scaling factors.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ gescale_row_col_batch_kernel() [1/2]

template<typename scalar_t , typename scalar_t2 >
__global__ void slate::device::gescale_row_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda 

Kernel implementing row and column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ gescale_row_col_batch_kernel() [2/2]

template<typename scalar_t , typename scalar_t2 >
void slate::device::gescale_row_col_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t2 const *const *  Rarray,
scalar_t2 const *const *  Carray,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 

Kernel implementing row and column scaling.

Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]RarrayVector of length m containing row scaling factors.
[in]CarrayVector of length n containing column scaling factors.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ geset()

template<typename scalar_t >
void slate::device::geset ( int64_t  m,
int64_t  n,
scalar_t const &  offdiag_value,
scalar_t const &  diag_value,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 

Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals.

[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueThe value to set outside of the diagonal.
[in]diag_valueThe value to set on the diagonal.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ geset_batch_kernel()

template<typename scalar_t >
__global__ void slate::device::geset_batch_kernel ( int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t **  Aarray,
int64_t  lda 

Kernel implementing element-wise tile set.

◆ geset_func()

template<typename scalar_t >
__device__ void slate::device::geset_func ( int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 

Kernel implementing element-wise tile set.

Each thread block deals with one tile. Each thread deals with one row. Launched by geset_kernel() and geset_batch_kernel().

Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals.

[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueThe value to set outside of the diagonal.
[in]diag_valueThe value to set on the diagonal.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ geset_kernel()

template<typename scalar_t >
__global__ void slate::device::geset_kernel ( int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 

Kernel implementing element-wise tile set.

Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals.

[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueThe value to set outside of the diagonal.
[in]diag_valueThe value to set on the diagonal.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ henorm()

template<typename scalar_t >
void slate::device::henorm ( lapack::Norm  norm,
lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

[in]normNorm to compute. See values for description.
[in]uploWhether each Aarray[k] is stored in the upper or lower triangle.
[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k values[k*2 + 1] = sumsq_k where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., n-1, j = 0, ..., n-1.

[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k values[k*2 + 1] = sumsq_k where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamdevice to execute in.

◆ henorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::henorm_fro_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().

[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block, hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale, tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ henorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::henorm_max_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * n. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().

[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x), hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ henorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::henorm_one_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().

[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x), hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ imag()

__host__ __device__ double slate::device::imag ( double  x)
imaginary component of complex number x; 0 for real number.

◆ max_nan_reduce()

template<typename real_t >
__device__ void slate::device::max_nan_reduce ( int  n,
int  tid,
real_t *  x 

Max reduction of n-element array x, leaving total in x[0].

Propagates NaN values consistently. With k threads, can reduce array up to 2*k in size. Assumes number of threads <= 1024, which is the current max number of CUDA threads.

[in]nSize of array.
[in]tidThread id.
[in]xArray of dimension n. On exit, x[0] = max(x[0], ..., x[n-1]); the rest of x is overwritten.

◆ real()

__host__ __device__ double slate::device::real ( double  x)
real component of complex number x; x for real number.

◆ roundup()

template<typename T >
__host__ __device__ constexpr T slate::device::roundup ( T  x,
T  y 
ceil( x / y )*y, i.e., x rounded up to next multiple of y.

◆ sqr()

template<typename scalar_t >
__host__ __device__ scalar_t slate::device::sqr ( scalar_t  x)

Square of number.


◆ sum_reduce()

template<typename real_t >
__device__ void slate::device::sum_reduce ( int  n,
int  tid,
real_t *  x 

Sum reduction of n-element array x, leaving total in x[0].

With k threads, can reduce array up to 2*k in size. Assumes number of threads <= 1024 (which is current max number of CUDA threads).

[in]nSize of array.
[in]tidThread id.
[in]xArray of dimension n. On exit, x[0] = sum(x[0], ..., x[n-1]); rest of x is overwritten.

◆ synorm()

template<typename scalar_t >
void slate::device::synorm ( lapack::Norm  norm,
lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

[in]normNorm to compute. See values for description.
[in]uploWhether each Aarray[k] is stored in the upper or lower triangle.
[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k, values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., n-1, j = 0, ..., n-1.

[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k values[k*2 + 1] = sumsq_k where scale_k^2 sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamdevice to execute in.

◆ synorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_fro_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().

[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block, hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale, tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ synorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_max_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * n. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().

[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x), hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ synorm_offdiag_one_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_offdiag_one_kernel ( int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 

Sum of absolute values of each row and each column of elements, for each tile in tiles.

Each thread block deals with one tile. Kernel assumes non-trivial tiles (m, n >= 1). Launched by synormOffdiag().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k), and tiles_sums[k*ldv + i + n] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ synorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::synorm_one_kernel ( lapack::Uplo  uplo,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().

[in]nNumber of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x), hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= n.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ synormOffdiag()

template<typename scalar_t >
void slate::device::synormOffdiag ( lapack::Norm  norm,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine that computes a partial norm for each tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

Used for full, off-diagonal tiles within a symmetric matrix, where element Aij contributes to both column i and j.

[in]normNorm to compute. See values for description.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k, values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., n-1, j = 0, ..., n-1.

[in]nNumber of rows and columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= n.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: for symmetric, same as Norm::One
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k, values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamGPU device to execute in.

◆ transpose() [1/3]

template<typename scalar_t >
void slate::device::transpose ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t *  dA,
int64_t  lda,
scalar_t *  dAT,
int64_t  ldat,
blas::Queue &  queue 

Physically transpose a rectangular matrix out-of-place.

[in]mNumber of rows of dA (columns of dAT). m >= 0.
[in]nNumber of columns of dA (rows of dAT). n >= 0.
[in]dAA rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of dA. lda >= m.
[out]dATA rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, dAT is the transpose of dA.
[in]ldatLeading dimension of dAT. ldat >= n.
[in]queueBLAS++ queue to execute in.

◆ transpose() [2/3]

template<typename scalar_t , int NX>
void slate::device::transpose ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t *  dA,
int64_t  lda,
scalar_t *  dAT,
int64_t  ldat,
blas::Queue &  queue 

Physically transpose a rectangular matrix out-of-place.

[in]mNumber of rows of dA (columns of dAT). m >= 0.
[in]nNumber of columns of dA (rows of dAT). n >= 0.
[in]dAA rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of dA. lda >= m.
[out]dATA rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, dAT is the transpose of dA.
[in]ldatLeading dimension of dAT. ldat >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose() [3/3]

template<typename scalar_t >
void slate::device::transpose ( bool  is_conj,
int64_t  n,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 

Physically transpose a square matrix in place.

[in]nNumber of rows and columns of each tile. n >= 0.
[in,out]AA square n-by-n matrix stored in an lda-by-n array in GPU memory. On output, A is transposed.
[in]ldaLeading dimension of A. lda >= n.
[in]queueBLAS++ queue to execute in.

◆ transpose_batch() [1/3]

template<typename scalar_t >
void slate::device::transpose_batch ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t **  dA_array,
int64_t  lda,
scalar_t **  dAT_array,
int64_t  ldat,
int64_t  batch_count,
blas::Queue &  queue 

Physically transpose a batch of rectangular matrices out-of-place.

[in]mNumber of rows of each dA_array[k] tile (columns of each dAT_array[k] tile). m >= 0.
[in]nNumber of columns of each dA_array[k] tile (rows of each dAT_array[k] tile). n >= 0.
[in]dA_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each dA_array[k] tile. lda >= m.
[out]dAT_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dAT_array[k] is a rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, each dAT_array[k] is the transpose of dA_array[k].
[in]ldatLeading dimension of each dAT_array[k] tile. ldat >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose_batch() [2/3]

template<typename scalar_t , int NX>
void slate::device::transpose_batch ( bool  is_conj,
int64_t  m,
int64_t  n,
scalar_t **  dA_array,
int64_t  lda,
scalar_t **  dAT_array,
int64_t  ldat,
int64_t  batch_count,
blas::Queue &  queue 

Physically transpose a batch of rectangular matrices out-of-place.

[in]mNumber of rows of each dA_array[k] tile (columns of each dAT_array[k] tile). m >= 0.
[in]nNumber of columns of each dA_array[k] tile (rows of each dAT_array[k] tile). n >= 0.
[in]dA_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each dA_array[k] tile. lda >= m.
[out]dAT_arrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each dAT_array[k] is a rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, each dAT_array[k] is the transpose of dA_array[k].
[in]ldatLeading dimension of each dAT_array[k] tile. ldat >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose_batch() [3/3]

template<typename scalar_t >
void slate::device::transpose_batch ( bool  is_conj,
int64_t  n,
scalar_t **  Aarray,
int64_t  lda,
int64_t  batch_count,
blas::Queue &  queue 

Physically transpose a batch of square matrices in place.

[in]nNumber of rows and columns of each tile. n >= 0.
[in,out]AarrayArray in GPU memory of dimension batch_count, containing pointers to matrices, where each Aarray[k] is a square n-by-n matrix stored in an lda-by-n array in GPU memory. On output, each Aarray[k] is transposed.
[in]ldaLeading dimension of each tile. lda >= n.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ transpose_func() [1/2]

template<typename scalar_t , int NX>
__device__ void slate::device::transpose_func ( bool  is_conj,
int  m,
int  n,
const scalar_t *  A,
int64_t  lda,
scalar_t *  AT,
int64_t  ldat 

tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB.

Uses NX-by-NY threads, where NB/NX, NB/NY, and NX/NY all divide evenly.

    subtile each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB
    for each subtile
        load NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY
        save NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY
        A  += NX
        AT += NX*ldat

e.g., with NB=32, NX=32, NY=8 ([sdc] precisions):

    load 32x32 subtile as 4 blocks of 32x8 columns:   (A11  A12  A13  A14 )
    save 32x32 subtile as 1*4 blocks of 32x8 columns: (AT11 AT12 AT13 AT14)

e.g., with NB=32, NX=16, NY=8 (z precision):

    load 16x32 subtile as 4 blocks of 16x8 columns:   (A11  A12  A13  A14)
    save 32x16 subtile as 2*2 blocks of 16x8 columns: (AT11 AT12)
                                                      (AT21 AT22)

◆ transpose_func() [2/2]

template<typename scalar_t >
__device__ void slate::device::transpose_func ( bool  is_conj,
int  n,
scalar_t *  A,
int64_t  lda 

Device routine handles one matrix.

Thread block grid: x = batch index (ignored here; see batch kernel), y = block row index, z = block col index. Each thread block is ib-by-ib threads and does one ib-by-ib block of an n-by-n matrix.

Let nt = ceildiv( n, ib ) be the number of blocks for one n-by-n matrix. An even number of blocks uses an (nt + 1) by (nt/2) grid. Example: for nt = 4 blocks, y by z = 5 by 2 grid:

    [ A00  A01 ]
    [----. A11 ]                    [ A10  .  |  .   .  ]
    [ A10 '----]                    [ A20 A21 |  .   .  ]
    [ A20  A21 ]  covers matrix as  [ A30 A31 | A00  .  ]
    [ A30  A31 ]                    [ A40 A41 | A01 A11 ]
    [ A40  A41 ]

An odd number of blocks uses an (nt) by (nt + 1)/2 grid. Example: for nt = 5 blocks, y by z = 5 by 3 grid:

    [ A00 | A01   A02 ]
    [     '-----.     ]                    [ A00  .   .  |  .   .  ]
    [ A10   A11 | A12 ]                    [ A10 A11  .  |  .   .  ]
    [           '-----]  covers matrix as  [ A20 A21 A22 |  .   .  ]
    [ A20   A21   A22 ]                    [ A30 A31 A32 | A01  .  ]
    [ A30   A31   A32 ]                    [ A40 A41 A42 | A02 A12 ]
    [ A40   A41   A42 ]

◆ transpose_rect_batch_func()

template<typename scalar_t , int NX>
void slate::device::transpose_rect_batch_func ( bool  is_conj,
int  m,
int  n,
scalar_t **  dAarray,
int64_t  lda,
scalar_t **  dATarray,
int64_t  ldat,
int  batch_count,
blas::Queue &  queue 

Device routine handles batches of rectangular matrices.

The routine loads blocks of data into small NX x NB local storage and then writes the blocks back, transposed, into the correct location.

◆ transpose_rect_func()

template<typename scalar_t , int NX>
void slate::device::transpose_rect_func ( bool  is_conj,
int  m,
int  n,
scalar_t *  dA,
int64_t  lda,
scalar_t *  dAT,
int64_t  ldat,
blas::Queue &  queue 

Device routine handles a single rectangular matrix.

The routine loads blocks of data into small NX x NB local storage and then writes the blocks back, transposed, into the correct location.

◆ transpose_sqr_batch_func()

template<typename scalar_t >
void slate::device::transpose_sqr_batch_func ( bool  is_conj,
int  n,
scalar_t **  Aarray,
int64_t  lda,
int  batch_count,
blas::Queue &  queue 

Device routine handles batches of square matrices.

The routine loads blocks of data into small ib x ib local storage and then writes the blocks back, transposed, into the correct location.

◆ transpose_sqr_func()

template<typename scalar_t >
void slate::device::transpose_sqr_func ( bool  is_conj,
int  n,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 

Device routine handles single square matrix.

The routine loads blocks of data into small ib x ib local storage and then writes the blocks back, transposed, into the correct location.

◆ trnorm()

template<typename scalar_t >
void slate::device::trnorm ( lapack::Norm  norm,
lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  values,
int64_t  ldv,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine that computes a partial norm for each trapezoidal tile.

Batched routine that returns the largest absolute value of elements for each tile in Aarray.

todo: rename to tznorm for consistency with other tz routines.

[in]normNorm to compute. See values for description.
[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]diagWhether or not each Aarray[k] has unit diagonal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k, values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of values array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., m-1, j = 0, ..., n-1.

[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile. lda >= m.
[out]valuesArray in GPU memory, dimension batch_count * ldv.
  • Norm::Max: ldv = 1. On exit, values[k] = max_{i, j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count.
  • Norm::One: ldv >= n. On exit, values[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= j < n.
  • Norm::Inf: ldv >= m. On exit, values[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for 0 <= k < batch_count, 0 <= i < m.
  • Norm::Fro: ldv = 2. On exit, values[k*2 + 0] = scale_k, values[k*2 + 1] = sumsq_k, where scale_k^2 * sumsq_k = sum_{i,j} abs( A^(k)_(i, j) )^2 for 0 <= k < batch_count.
[in]ldvLeading dimension of tiles_sums (values) array.
[in]batch_countSize of Aarray. batch_count >= 0.
[in]streamdevice to execute in.

◆ trnorm_fro_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_fro_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_values 

Sum of squares, in scaled representation, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block, hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_valuesArray of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale, tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k).

◆ trnorm_inf_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_inf_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 

Sum of absolute values of each row of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block, hence,
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ trnorm_max_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_max_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_maxima 

Finds the largest absolute value of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * m. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

[in]mNumber of rows of each tile. m >= 1. Also the number of threads per block (blockDim.x), hence,
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_maximaArray of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k).

◆ trnorm_one_kernel()

template<typename scalar_t >
__global__ void slate::device::trnorm_one_kernel ( lapack::Uplo  uplo,
lapack::Diag  diag,
int64_t  m,
int64_t  n,
scalar_t const *const *  Aarray,
int64_t  lda,
blas::real_type< scalar_t > *  tiles_sums,
int64_t  ldv 

Sum of absolute values of each column of elements, for each tile in Aarray.

Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1. Also the number of threads per block (blockDim.x), hence,
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile. lda >= m.
[out]tiles_sumsArray of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k).
[in]ldvLeading dimension of tiles_sums (values) array.

◆ tzadd()

template<typename scalar_t >
void slate::device::tzadd ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t const &  alpha,
scalar_t **  Aarray,
int64_t  lda,
scalar_t const &  beta,
scalar_t **  Barray,
int64_t  ldb,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine for element-wise trapezoidal tile addition.

Sets upper or lower part of

\[ Barray[k] = \alpha Aarray[k] + \beta Barray[k]. \]

[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

Sets upper or lower part of

\[ Barray[k] = \alpha Aarray[k] + \beta Barray[k]. \]

[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]alphaThe scalar alpha.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[in]betaThe scalar beta.
[in,out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.

◆ tzadd_kernel()

template<typename scalar_t >
__global__ void slate::device::tzadd_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  alpha,
scalar_t **  Aarray,
int64_t  lda,
scalar_t  beta,
scalar_t **  Barray,
int64_t  ldb 

Kernel implementing element-wise tile addition.

Each thread block deals with one tile. Each thread deals with one row. Launched by tzadd().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[in,out]BarrayArray of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of each tile in Barray. ldb >= m.

◆ tzcopy()

template<typename src_scalar_t , typename dst_scalar_t >
void slate::device::tzcopy ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb,
int64_t  batch_count,
blas::Queue &  queue 

Batched routine for element-wise trapezoidal copy and precision conversion, copying A to B.

Batched routine for element-wise copy and precision conversion.

Sets upper or lower part of

\[ Barray[k] = Aarray[k]. \]

[in]uploWhether each Aarray[k] is upper or lower trapezoidal.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]queueBLAS++ queue to execute in.
[in]mNumber of rows of each tile. m >= 0.
[in]nNumber of columns of each tile. n >= 0.
[in]AarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of each tile in A. lda >= m.
[out]BarrayArray in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory.
[in]ldbLeading dimension of each tile in B. ldb >= m.
[in]batch_countSize of Aarray and Barray. batch_count >= 0.
[in]streamDevice to execute in.

◆ tzcopy_kernel()

template<typename src_scalar_t , typename dst_scalar_t >
__global__ void slate::device::tzcopy_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
src_scalar_t const *const *  Aarray,
int64_t  lda,
dst_scalar_t **  Barray,
int64_t  ldb 

Kernel implementing copy and precision conversions, copying A to B.

Each thread block deals with one tile. Each thread deals with one row. Launched by tzcopy().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.
[out]BarrayArray of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array.
[in]ldbLeading dimension of each tile in Barray. ldb >= m.

◆ tzscale_kernel()

template<typename scalar_t >
__global__ void slate::device::tzscale_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
blas::real_type< scalar_t >  numer,
blas::real_type< scalar_t >  denom,
scalar_t **  Aarray,
int64_t  lda 

Kernel implementing element-wise tile scale.

Each thread block deals with one tile. Each thread deals with one row. Launched by tzscale().

[in]mNumber of rows of each tile. m >= 1.
[in]nNumber of columns of each tile. n >= 1.
[in]numerScale value numerator.
[in]denomScale value denominator.
[in,out]AarrayArray of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array.
[in]ldaLeading dimension of each tile in Aarray. lda >= m.

◆ tzset()

template<typename scalar_t >
void slate::device::tzset ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t const &  offdiag_value,
scalar_t const &  diag_value,
scalar_t *  A,
int64_t  lda,
blas::Queue &  queue 

Element-wise trapezoidal tile set.

Sets upper or lower part of A to diag_value on the diagonal and offdiag_value on the off-diagonals.

[in]uploWhether A is upper or lower trapezoidal.
[in]mNumber of rows of A. m >= 0.
[in]nNumber of columns of A. n >= 0.
[in]offdiag_valueConstant to set offdiagonal entries to.
[in]diag_valueConstant to set diagonal entries to.
[out]AAn m-by-n matrix stored in an lda-by-n array in GPU memory.
[in]ldaLeading dimension of A. lda >= m.
[in]queueBLAS++ queue to execute in.

◆ tzset_batch_kernel()

template<typename scalar_t >
__global__ void slate::device::tzset_batch_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t **  Aarray,
int64_t  lda 

Kernel implementing element-wise tile set.

◆ tzset_func()

template<typename scalar_t >
__device__ void slate::device::tzset_func ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 

Device function implementing element-wise tile set.

Each thread block deals with one tile. gridDim.x == batch_count. Each thread deals with one row. Called by tzset_kernel and tzset_batch_kernel.

◆ tzset_kernel()

template<typename scalar_t >
__global__ void slate::device::tzset_kernel ( lapack::Uplo  uplo,
int64_t  m,
int64_t  n,
scalar_t  offdiag_value,
scalar_t  diag_value,
scalar_t *  A,
int64_t  lda 

Kernel implementing element-wise tile set.

Variable Documentation

◆ ib

static const int slate::device::ib = 32

block size for genorm_one_kernel

Internal blocking: a 16 x 16 thread block = 256 threads; a 32 x 32 thread block = 1024 threads.