SLATE 2024.05.31
Software for Linear Algebra Targeting Exascale
|
GPU device implementations of kernels. More...
Classes | |
struct | nx_traits |
Look up NX based on data type. More... | |
Functions | |
template<typename src_scalar_t , typename dst_scalar_t > | |
void | gecopy (int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
Batched routine for element-wise copy and precision conversion, copying A to B. | |
template<typename src_scalar_t , typename dst_scalar_t > | |
void | tzcopy (Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
void | geadd (int64_t m, int64_t n, scalar_t const &alpha, scalar_t *A, int64_t lda, scalar_t const &beta, scalar_t *B, int64_t ldb, blas::Queue &queue) |
Routine for element-wise tile addition. | |
template<typename scalar_t > | |
void | tzadd (Uplo uplo, int64_t m, int64_t n, scalar_t const &alpha, scalar_t **Aarray, int64_t lda, scalar_t const &beta, scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t , typename scalar_t2 > | |
void | gescale (int64_t m, int64_t n, scalar_t2 numer, scalar_t2 denom, scalar_t *A, int64_t lda, blas::Queue &queue) |
Routine for element-wise tile scale. | |
template<typename scalar_t , typename scalar_t2 > | |
void | gescale_row_col_batch (Equed equed, int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
Batched routine for row and column scaling. | |
template<typename scalar_t > | |
void | geset (int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue) |
Sets the m-by-n matrix A element-wise to diag_value on the diagonal and offdiag_value on the off-diagonals. | |
template<typename scalar_t > | |
void | tzset (Uplo uplo, int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue) |
template<typename scalar_t > | |
void | genorm (Norm norm, NormScope scope, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
void | henorm (Norm norm, Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
void | synorm (Norm norm, Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
void | synormOffdiag (Norm norm, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
void | trnorm (Norm norm, Uplo uplo, Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
void | transpose (bool is_conj, int64_t n, scalar_t *A, int64_t lda, blas::Queue &queue) |
Physically transpose a square matrix in place. | |
template<typename scalar_t > | |
void | transpose_batch (bool is_conj, int64_t n, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
Physically transpose a batch of square matrices in place. | |
template<typename scalar_t > | |
void | transpose (bool is_conj, int64_t m, int64_t n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue) |
Physically transpose a rectangular matrix out-of-place. | |
template<typename scalar_t > | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue) |
Physically transpose a batch of rectangular matrices out-of-place. | |
template<typename scalar_t > | |
__device__ void | geadd_func (int64_t m, int64_t n, scalar_t alpha, scalar_t *A, int64_t lda, scalar_t beta, scalar_t *B, int64_t ldb) |
Device function implementing element-wise tile addition. | |
template<typename scalar_t > | |
__global__ void | geadd_kernel (int64_t m, int64_t n, scalar_t alpha, scalar_t *A, int64_t lda, scalar_t beta, scalar_t *B, int64_t ldb) |
Kernel implementing element-wise tile addition. | |
template<typename scalar_t > | |
__global__ void | geadd_batch_kernel (int64_t m, int64_t n, scalar_t alpha, scalar_t **Aarray, int64_t lda, scalar_t beta, scalar_t **Barray, int64_t ldb) |
Kernel implementing batched element-wise tile addition. | |
template void | geadd (int64_t m, int64_t n, float const &alpha, float *Aarray, int64_t lda, float const &beta, float *Barray, int64_t ldb, blas::Queue &queue) |
template void | geadd (int64_t m, int64_t n, double const &alpha, double *Aarray, int64_t lda, double const &beta, double *Barray, int64_t ldb, blas::Queue &queue) |
template<> | |
void | geadd (int64_t m, int64_t n, std::complex< float > const &alpha, std::complex< float > *Aarray, int64_t lda, std::complex< float > const &beta, std::complex< float > *Barray, int64_t ldb, blas::Queue &queue) |
template<> | |
void | geadd (int64_t m, int64_t n, std::complex< double > const &alpha, std::complex< double > *Aarray, int64_t lda, std::complex< double > const &beta, std::complex< double > *Barray, int64_t ldb, blas::Queue &queue) |
template<typename src_scalar_t , typename dst_scalar_t > | |
__global__ void | gecopy_kernel (int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb) |
Kernel implementing copy and precision conversions, copying A to B. | |
template void | gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gecopy (int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gecopy (int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gecopy (int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gecopy (int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gecopy (int64_t m, int64_t n, float const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gecopy (int64_t m, int64_t n, double const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
__global__ void | genorm_max_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima) |
Finds the largest absolute value of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | genorm_one_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv) |
Sum of absolute values of each column of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | genorm_inf_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv) |
Sum of absolute values of each row of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | genorm_fro_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values) |
Sum of squares, in scaled representation, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | ge_col_norms_max_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *col_max, int64_t ldv) |
template<typename scalar_t > | |
void | genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
Batched routine that computes a partial norm for each tile. | |
template void | genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template void | genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | genorm (lapack::Norm norm, NormScope scope, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t , typename scalar_t2 > | |
__device__ void | gescale_func (int64_t m, int64_t n, scalar_t2 mul, scalar_t *A, int64_t lda) |
Device function implementing element-wise tile scale. | |
template<typename scalar_t , typename scalar_t2 > | |
__global__ void | gescale_kernel (int64_t m, int64_t n, scalar_t2 mul, scalar_t *A, int64_t lda) |
Kernel implementing element-wise tile scale. | |
template<typename scalar_t , typename scalar_t2 > | |
__global__ void | gescale_batch_kernel (int64_t m, int64_t n, scalar_t2 mul, scalar_t **Aarray, int64_t lda) |
Kernel implementing element-wise tile scale. | |
template void | gescale (int64_t m, int64_t n, float numer, float denom, float *A, int64_t lda, blas::Queue &queue) |
template void | gescale (int64_t m, int64_t n, double numer, double denom, double *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | gescale (int64_t m, int64_t n, float numer, float denom, std::complex< float > *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | gescale (int64_t m, int64_t n, std::complex< float > numer, std::complex< float > denom, std::complex< float > *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | gescale (int64_t m, int64_t n, double numer, double denom, std::complex< double > *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | gescale (int64_t m, int64_t n, std::complex< double > numer, std::complex< double > denom, std::complex< double > *A, int64_t lda, blas::Queue &queue) |
template<typename scalar_t , typename scalar_t2 > | |
__global__ void | gescale_row_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda) |
Kernel implementing row and column scaling. | |
template<typename scalar_t , typename scalar_t2 > | |
__global__ void | gescale_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda) |
Kernel implementing column scaling. | |
template<typename scalar_t , typename scalar_t2 > | |
__global__ void | gescale_row_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t **Aarray, int64_t lda) |
Kernel implementing row scaling. | |
template void | gescale_row_col_batch (Equed equed, int64_t m, int64_t n, float const *const *Rarray, float const *const *Carray, float **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template void | gescale_row_col_batch (Equed equed, int64_t m, int64_t n, double const *const *Rarray, double const *const *Carray, double **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gescale_row_col_batch (Equed equed, int64_t m, int64_t n, float const *const *Rarray, float const *const *Carray, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gescale_row_col_batch (Equed equed, int64_t m, int64_t n, double const *const *Rarray, double const *const *Carray, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gescale_row_col_batch (Equed equed, int64_t m, int64_t n, std::complex< float > const *const *Rarray, std::complex< float > const *const *Carray, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | gescale_row_col_batch (Equed equed, int64_t m, int64_t n, std::complex< double > const *const *Rarray, std::complex< double > const *const *Carray, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
__device__ void | geset_func (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda) |
Device function implementing element-wise tile set. | |
template<typename scalar_t > | |
__global__ void | geset_kernel (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda) |
Kernel implementing element-wise tile set. | |
template<typename scalar_t > | |
__global__ void | geset_batch_kernel (int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t **Aarray, int64_t lda) |
Kernel implementing element-wise tile set. | |
template void | geset (int64_t m, int64_t n, float const &offdiag_value, float const &diag_value, float *A, int64_t lda, blas::Queue &queue) |
template void | geset (int64_t m, int64_t n, double const &offdiag_value, double const &diag_value, double *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | geset (int64_t m, int64_t n, std::complex< float > const &offdiag_value, std::complex< float > const &diag_value, std::complex< float > *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | geset (int64_t m, int64_t n, std::complex< double > const &offdiag_value, std::complex< double > const &diag_value, std::complex< double > *A, int64_t lda, blas::Queue &queue) |
template<typename scalar_t > | |
__global__ void | henorm_max_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima) |
Finds the largest absolute value of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | henorm_one_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv) |
Sum of absolute values of each column of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | henorm_fro_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values) |
Sum of squares, in scaled representation, for each tile in Aarray. | |
template<typename scalar_t > | |
void | henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
Batched routine that computes a partial norm for each tile. | |
template void | henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template void | henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | henorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
__global__ void | synorm_max_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima) |
Finds the largest absolute value of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | synorm_one_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv) |
Sum of absolute values of each column of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | synorm_fro_kernel (lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values) |
Sum of squares, in scaled representation, for each tile in Aarray. | |
template<typename scalar_t > | |
void | synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
Batched routine that computes a partial norm for each tile. | |
template<typename scalar_t > | |
__global__ void | synorm_offdiag_one_kernel (int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv) |
Sum of absolute values of each row and each column of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
void | synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
Batched routine that computes a partial norm for each tile. | |
template void | synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template void | synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | synorm (lapack::Norm norm, lapack::Uplo uplo, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template void | synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template void | synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | synormOffdiag (lapack::Norm norm, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
__device__ void | transpose_func (bool is_conj, int n, scalar_t *A, int64_t lda) |
Device routine handles one matrix. | |
template<typename scalar_t , int NX> | |
__device__ void | transpose_func (bool is_conj, int m, int n, const scalar_t *A, int64_t lda, scalar_t *AT, int64_t ldat) |
Tiles the M-by-N matrix with ceil(M/NB)-by-ceil(N/NB) tiles, each sized NB-by-NB. | |
template<typename scalar_t > | |
__global__ void | transpose_kernel (bool is_conj, int n, scalar_t *A, int64_t lda) |
In-place transpose of a square buffer. | |
template<typename scalar_t > | |
__global__ void | transpose_batch_kernel (bool is_conj, int n, scalar_t **Aarray, int64_t lda) |
In-place transpose of an array of square buffers. | |
template<typename scalar_t , int NX> | |
__global__ void | transpose_kernel (bool is_conj, int m, int n, const scalar_t *A, int64_t lda, scalar_t *AT, int64_t ldat) |
Out-of-place transpose of a rectangular buffer; transposes A onto AT. | |
template<typename scalar_t , int NX> | |
__global__ void | transpose_batch_kernel (bool is_conj, int m, int n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat) |
Out-of-place transpose of an array of rectangular buffers; transposes dA_array onto dAT_array. | |
template void | transpose (bool is_conj, int64_t n, float *A, int64_t lda, blas::Queue &queue) |
template void | transpose (bool is_conj, int64_t n, double *A, int64_t lda, blas::Queue &queue) |
template void | transpose (bool is_conj, int64_t m, int64_t n, float *A, int64_t lda, float *B, int64_t ldb, blas::Queue &queue) |
template void | transpose (bool is_conj, int64_t m, int64_t n, double *A, int64_t lda, double *B, int64_t ldb, blas::Queue &queue) |
template<> | |
void | transpose (bool is_conj, int64_t n, std::complex< float > *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | transpose (bool is_conj, int64_t n, std::complex< double > *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | transpose (bool is_conj, int64_t m, int64_t n, std::complex< float > *A, int64_t lda, std::complex< float > *B, int64_t ldb, blas::Queue &queue) |
template<> | |
void | transpose (bool is_conj, int64_t m, int64_t n, std::complex< double > *A, int64_t lda, std::complex< double > *B, int64_t ldb, blas::Queue &queue) |
template void | transpose_batch (bool is_conj, int64_t n, float **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template void | transpose_batch (bool is_conj, int64_t n, double **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template void | transpose_batch (bool is_conj, int64_t m, int64_t n, float **Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | transpose_batch (bool is_conj, int64_t m, int64_t n, double **Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t n, std::complex< float > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t n, std::complex< double > **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< float > **Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< double > **Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
__global__ void | trnorm_max_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_maxima) |
Finds the largest absolute value of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | trnorm_one_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv) |
Sum of absolute values of each column of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | trnorm_inf_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_sums, int64_t ldv) |
Sum of absolute values of each row of elements, for each tile in Aarray. | |
template<typename scalar_t > | |
__global__ void | trnorm_fro_kernel (lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *tiles_values) |
Sum of squares, in scaled representation, for each tile in Aarray. | |
template<typename scalar_t > | |
void | trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, scalar_t const *const *Aarray, int64_t lda, blas::real_type< scalar_t > *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
Batched routine that computes a partial norm for each trapezoidal tile. | |
template void | trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template void | trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, float *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | trnorm (lapack::Norm norm, lapack::Uplo uplo, lapack::Diag diag, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, double *values, int64_t ldv, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
__global__ void | tzadd_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t alpha, scalar_t **Aarray, int64_t lda, scalar_t beta, scalar_t **Barray, int64_t ldb) |
Kernel implementing element-wise tile addition. | |
template<typename scalar_t > | |
void | tzadd (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t const &alpha, scalar_t **Aarray, int64_t lda, scalar_t const &beta, scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
Batched routine for element-wise trapezoidal tile addition. | |
template void | tzadd (lapack::Uplo uplo, int64_t m, int64_t n, float const &alpha, float **Aarray, int64_t lda, float const &beta, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | tzadd (lapack::Uplo uplo, int64_t m, int64_t n, double const &alpha, double **Aarray, int64_t lda, double const &beta, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | tzadd (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const &alpha, std::complex< float > **Aarray, int64_t lda, std::complex< float > const &beta, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | tzadd (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const &alpha, std::complex< double > **Aarray, int64_t lda, std::complex< double > const &beta, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<typename src_scalar_t , typename dst_scalar_t > | |
__global__ void | tzcopy_kernel (lapack::Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb) |
Kernel implementing copy and precision conversions, copying A to B. | |
template<typename src_scalar_t , typename dst_scalar_t > | |
void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, src_scalar_t const *const *Aarray, int64_t lda, dst_scalar_t **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
Batched routine for element-wise trapezoidal copy and precision conversion, copying A to B. | |
template void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, float const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, double **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, double const *const *Aarray, int64_t lda, float **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< double > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | tzcopy (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const *const *Aarray, int64_t lda, std::complex< float > **Barray, int64_t ldb, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t > | |
__global__ void | tzscale_kernel (lapack::Uplo uplo, int64_t m, int64_t n, blas::real_type< scalar_t > numer, blas::real_type< scalar_t > denom, scalar_t **Aarray, int64_t lda) |
Kernel implementing element-wise tile scale. | |
template<typename scalar_t > | |
__device__ void | tzset_func (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda) |
Device function implementing element-wise tile set. | |
template<typename scalar_t > | |
__global__ void | tzset_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t *A, int64_t lda) |
Kernel implementing element-wise tile set. | |
template<typename scalar_t > | |
__global__ void | tzset_batch_kernel (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t offdiag_value, scalar_t diag_value, scalar_t **Aarray, int64_t lda) |
Kernel implementing element-wise tile set. | |
template<typename scalar_t > | |
void | tzset (lapack::Uplo uplo, int64_t m, int64_t n, scalar_t const &offdiag_value, scalar_t const &diag_value, scalar_t *A, int64_t lda, blas::Queue &queue) |
Element-wise trapezoidal tile set. | |
template void | tzset (lapack::Uplo uplo, int64_t m, int64_t n, float const &offdiag_value, float const &diag_value, float *A, int64_t lda, blas::Queue &queue) |
template void | tzset (lapack::Uplo uplo, int64_t m, int64_t n, double const &offdiag_value, double const &diag_value, double *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | tzset (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< float > const &offdiag_value, std::complex< float > const &diag_value, std::complex< float > *A, int64_t lda, blas::Queue &queue) |
template<> | |
void | tzset (lapack::Uplo uplo, int64_t m, int64_t n, std::complex< double > const &offdiag_value, std::complex< double > const &diag_value, std::complex< double > *A, int64_t lda, blas::Queue &queue) |
template<typename real_t > | |
__host__ __device__ real_t | max_nan (real_t x, real_t y) |
Max that propagates NaN consistently: max_nan( 1, nan ) = nan and max_nan( nan, 1 ) = nan. | |
template<typename real_t > | |
__device__ void | max_nan_reduce (int n, int tid, real_t *x) |
Max reduction of n-element array x, leaving total in x[0]. | |
template<typename real_t > | |
__device__ void | sum_reduce (int n, int tid, real_t *x) |
Sum reduction of n-element array x, leaving total in x[0]. | |
__host__ __device__ double | real (rocblas_double_complex x) |
__host__ __device__ float | real (rocblas_float_complex x) |
__host__ __device__ double | imag (rocblas_double_complex x) |
__host__ __device__ float | imag (rocblas_float_complex x) |
__host__ __device__ rocblas_double_complex | conj (rocblas_double_complex x) |
__host__ __device__ rocblas_float_complex | conj (rocblas_float_complex x) |
__host__ __device__ double | real (double x) |
__host__ __device__ float | real (float x) |
__host__ __device__ double | imag (double x) |
__host__ __device__ float | imag (float x) |
__host__ __device__ double | conj (double x) |
__host__ __device__ float | conj (float x) |
__host__ __device__ float | abs (float x) |
Overloaded versions of absolute value on device. | |
__host__ __device__ double | abs (double x) |
__host__ __device__ float | abs (cuFloatComplex x) |
__host__ __device__ double | abs (cuDoubleComplex x) |
template<typename scalar_t > | |
__host__ __device__ scalar_t | sqr (scalar_t x) |
Square of number. | |
template<typename real_t > | |
__host__ __device__ void | combine_sumsq (real_t &scale1, real_t &sumsq1, real_t scale2, real_t sumsq2) |
Adds two scaled, sum-of-squares representations. | |
template<typename real_t > | |
__host__ __device__ void | add_sumsq (real_t &scale, real_t &sumsq, real_t absx) |
Adds new value to scaled, sum-of-squares representation. | |
template<typename T > | |
__host__ __device__ constexpr T | ceildiv (T x, T y) |
template<typename T > | |
__host__ __device__ constexpr T | roundup (T x, T y) |
template<typename TA , typename TB > | |
__host__ __device__ void | copy (TA a, TB &b) |
Overloaded copy and precision conversion. | |
__host__ __device__ void | copy (cuFloatComplex a, cuDoubleComplex &b) |
Sets b = a, converting from complex-float to complex-double. | |
__host__ __device__ void | copy (cuDoubleComplex a, cuFloatComplex &b) |
Sets b = a, converting from complex-double to complex-float. | |
__host__ __device__ void | copy (float a, cuFloatComplex &b) |
Sets b = a, converting from float to complex-float. | |
__host__ __device__ void | copy (double a, cuDoubleComplex &b) |
Sets b = a, converting from double to complex-double. | |
template<typename x_scalar_t , typename y_scalar_t > | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, x_scalar_t **dA_array, int64_t lda, y_scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue) |
template<typename scalar_t , typename scalar_t2 > | |
void | gescale_row_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
Kernel implementing row and column scaling. | |
template<typename scalar_t , typename scalar_t2 > | |
void | gescale_col_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Carray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
Kernel implementing column scaling. | |
template<typename scalar_t , typename scalar_t2 > | |
void | gescale_row_batch_kernel (int64_t m, int64_t n, scalar_t2 const *const *Rarray, scalar_t **Aarray, int64_t lda, int64_t batch_count, blas::Queue &queue) |
Kernel implementing row scaling. | |
template<typename scalar_t > | |
void | transpose_sqr_batch_func (bool is_conj, int n, scalar_t **Aarray, int64_t lda, int batch_count, blas::Queue &queue) |
Device routine handles batches of square matrices. | |
template<typename scalar_t > | |
void | transpose_sqr_func (bool is_conj, int n, scalar_t *A, int64_t lda, blas::Queue &queue) |
Device routine handles single square matrix. | |
template<typename scalar_t , int NX> | |
void | transpose_rect_batch_func (bool is_conj, int m, int n, scalar_t **dAarray, int64_t lda, scalar_t **dATarray, int64_t ldat, int batch_count, blas::Queue &queue) |
Device routine handles batches of rectangular matrices. | |
template<typename scalar_t , int NX> | |
void | transpose_rect_func (bool is_conj, int m, int n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue) |
Device routine handles a single rectangular matrix. | |
template<typename scalar_t , int NX> | |
void | transpose (bool is_conj, int64_t m, int64_t n, scalar_t *dA, int64_t lda, scalar_t *dAT, int64_t ldat, blas::Queue &queue) |
Physically transpose a rectangular matrix out-of-place. | |
template<typename scalar_t , int NX> | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, scalar_t **dA_array, int64_t lda, scalar_t **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue) |
Physically transpose a batch of rectangular matrices out-of-place. | |
template<> | |
void | transpose (bool is_conj, int64_t m, int64_t n, float *dA, int64_t lda, float *dAT, int64_t ldat, blas::Queue &queue) |
template<> | |
void | transpose (bool is_conj, int64_t m, int64_t n, double *dA, int64_t lda, double *dAT, int64_t ldat, blas::Queue &queue) |
template<> | |
void | transpose (bool is_conj, int64_t m, int64_t n, std::complex< float > *dA, int64_t lda, std::complex< float > *dAT, int64_t ldat, blas::Queue &queue) |
template<> | |
void | transpose (bool is_conj, int64_t m, int64_t n, std::complex< double > *dA, int64_t lda, std::complex< double > *dAT, int64_t ldat, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, float **dA_array, int64_t lda, float **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, double **dA_array, int64_t lda, double **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< float > **dA_array, int64_t lda, std::complex< float > **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue) |
template<> | |
void | transpose_batch (bool is_conj, int64_t m, int64_t n, std::complex< double > **dA_array, int64_t lda, std::complex< double > **dAT_array, int64_t ldat, int64_t batch_count, blas::Queue &queue) |
Variables | |
const int | ib = 32 |
block size for genorm_one_kernel | |
const int | ib1 = 33 |
ib + 1 for stride to avoid GPU bank conflicts | |
static const int | NB = 32 |
block size for transpose_func | |
static const int | NY = 8 |
y dim of thread block size for transpose_func | |
GPU device implementations of kernels.
__host__ __device__ void slate::device::add_sumsq | ( | real_t & | scale, |
real_t & | sumsq, | ||
real_t | absx | ||
) |
Adds new value to scaled, sum-of-squares representation.
On exit, scale and sumsq are updated such that: scale^2 sumsq := scale^2 sumsq + (absx)^2
|
inline constexpr |
__host__ __device__ void slate::device::combine_sumsq | ( | real_t & | scale1, |
real_t & | sumsq1, | ||
real_t | scale2, | ||
real_t | sumsq2 | ||
) |
Adds two scaled, sum-of-squares representations.
On exit, scale1 and sumsq1 are updated such that: scale1^2 sumsq1 := scale1^2 sumsq1 + scale2^2 sumsq2.
|
inline |
|
inline |
Overloaded copy and precision conversion.
Sets b = a, converting from type TA to type TB.
void slate::device::geadd | ( | int64_t | m, |
int64_t | n, | ||
scalar_t const & | alpha, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
scalar_t const & | beta, | ||
scalar_t * | B, | ||
int64_t | ldb, | ||
blas::Queue & | queue | ||
) |
Routine for element-wise tile addition.
Sets
\[ B = \alpha A + \beta B. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | A | is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | B | is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | queue | BLAS++ queue to execute in. |
Sets
\[ B = \alpha A + \beta B. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | A | is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | B | is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::geadd_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t | alpha, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
scalar_t | beta, | ||
scalar_t ** | Barray, | ||
int64_t | ldb | ||
) |
Kernel implementing element-wise tile addition.
__device__ void slate::device::geadd_func | ( | int64_t | m, |
int64_t | n, | ||
scalar_t | alpha, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
scalar_t | beta, | ||
scalar_t * | B, | ||
int64_t | ldb | ||
) |
Kernel implementing element-wise tile addition.
Each thread deals with one row. Launched by geadd_kernel() and geadd_batch_kernel().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | A | is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of A. lda >= m. |
[in,out] | B | is an m-by-n matrix stored in an ldb-by-n array. |
[in] | ldb | Leading dimension of B. ldb >= m. |
Routine for element-wise tile addition.
Sets
\[ B = \alpha A + \beta B. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | A | is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | B | is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | queue | BLAS++ queue to execute in. |
Sets
\[ B = \alpha A + \beta B. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | A | is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | B | is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::geadd_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t | alpha, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
scalar_t | beta, | ||
scalar_t * | B, | ||
int64_t | ldb | ||
) |
Kernel implementing element-wise tile addition.
Routine for element-wise tile addition.
Sets
\[ B = \alpha A + \beta B. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | A | is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | B | is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | queue | BLAS++ queue to execute in. |
Sets
\[ B = \alpha A + \beta B. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | A | is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | B | is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::gecopy | ( | int64_t | m, |
int64_t | n, | ||
src_scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
dst_scalar_t ** | Barray, | ||
int64_t | ldb, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine for element-wise copy and precision conversion, copying A to B.
Sets
\[ Barray[k] = Aarray[k]. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[out] | Barray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | batch_count | Size of Aarray and Barray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
Sets
\[ Barray[k] = Aarray[k]. \]
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[out] | Barray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | batch_count | Size of Aarray and Barray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::gecopy_kernel | ( | int64_t | m, |
int64_t | n, | ||
src_scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
dst_scalar_t ** | Barray, | ||
int64_t | ldb | ||
) |
Kernel implementing copy and precision conversions, copying A to B.
Each thread block deals with one tile. Each thread deals with one row. Launched by gecopy().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
[out] | Barray | Array of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. |
[in] | ldb | Leading dimension of each tile in Barray. ldb >= m. |
void slate::device::genorm | ( | lapack::Norm | norm, |
NormScope | scope, | ||
int64_t | m, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | values, | ||
int64_t | ldv, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine that computes a partial norm for each tile.
Batched routine that returns the largest absolute value of elements for each tile in Aarray.
[in] | norm | Norm to compute. See values for description. |
[in] | scope | Scope of the norm.
|
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of values array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., m-1, j = 0, ..., n-1.
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of tiles_sums (values) array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | stream | device to execute in. |
__global__ void slate::device::genorm_fro_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_values | ||
) |
Sum of squares, in scaled representation, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. Also the number of threads per block, hence, |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_values | Array of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k). |
__global__ void slate::device::genorm_inf_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_sums, | ||
int64_t | ldv | ||
) |
Sum of absolute values of each row of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().
[in] | m | Number of rows of each tile. m >= 1. Also the number of threads per block, hence, |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_sums | Array of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k). |
[in] | ldv | Leading dimension of tiles_sums (values) array. |
__global__ void slate::device::genorm_max_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_maxima | ||
) |
Finds the largest absolute value of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * m. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().
[in] | m | Number of rows of each tile. m >= 1. Also the number of threads per block (blockDim.x), hence, |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_maxima | Array of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k). |
__global__ void slate::device::genorm_one_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_sums, | ||
int64_t | ldv | ||
) |
Sum of absolute values of each column of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (m, n >= 1). Launched by genorm().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. Also the number of threads per block (blockDim.x), hence, |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_sums | Array of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k). |
[in] | ldv | Leading dimension of tiles_sums (values) array. |
void slate::device::gescale | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 | numer, | ||
scalar_t2 | denom, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
blas::Queue & | queue | ||
) |
Kernel implementing element-wise tile scale.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | numer | Scale value numerator. |
[in] | denom | Scale value denominator. |
[in,out] | A | An m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
__global__ void slate::device::gescale_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 | mul, | ||
scalar_t ** | Aarray, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile scale.
__global__ void slate::device::gescale_col_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 const *const * | Carray, | ||
scalar_t ** | Aarray, | ||
int64_t | lda | ||
) |
Kernel implementing column scaling.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Carray | Vector of length n containing column scaling factors. |
[in,out] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
void slate::device::gescale_col_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 const *const * | Carray, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Kernel implementing column scaling.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Carray | Vector of length n containing column scaling factors. |
[in,out] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
__device__ void slate::device::gescale_func | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 | mul, | ||
scalar_t * | A, | ||
int64_t | lda | ||
) |
Device function implementing element-wise tile scale.
Each thread block deals with one tile. gridDim.x == batch_count. Each thread deals with one row. Called by gescale_kernel and gescale_batch_kernel.
Kernel implementing element-wise tile scale.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | numer | Scale value numerator. |
[in] | denom | Scale value denominator. |
[in,out] | A | An m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
__global__ void slate::device::gescale_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 | mul, | ||
scalar_t * | A, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile scale.
Kernel implementing element-wise tile scale.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | numer | Scale value numerator. |
[in] | denom | Scale value denominator. |
[in,out] | A | An m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
__global__ void slate::device::gescale_row_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 const *const * | Rarray, | ||
scalar_t ** | Aarray, | ||
int64_t | lda | ||
) |
Kernel implementing row scaling.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Rarray | Vector of length m containing row scaling factors. |
[in,out] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
void slate::device::gescale_row_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 const *const * | Rarray, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Kernel implementing row scaling.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Rarray | Vector of length m containing row scaling factors. |
[in,out] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::gescale_row_col_batch | ( | Equed | equed, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t2 const *const * | Rarray, | ||
scalar_t2 const *const * | Carray, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine for row and column scaling.
[in] | equed | Form of scaling to do.
|
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Rarray | Vector of length m containing row scaling factors. |
[in] | Carray | Vector of length n containing column scaling factors. |
[in,out] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::gescale_row_col_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 const *const * | Rarray, | ||
scalar_t2 const *const * | Carray, | ||
scalar_t ** | Aarray, | ||
int64_t | lda | ||
) |
Kernel implementing row and column scaling.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Rarray | Vector of length m containing row scaling factors. |
[in] | Carray | Vector of length n containing column scaling factors. |
[in,out] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
void slate::device::gescale_row_col_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t2 const *const * | Rarray, | ||
scalar_t2 const *const * | Carray, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Kernel implementing row and column scaling.
Each thread block deals with one tile. Each thread deals with one row. Launched by gescale_row_col_batch().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Rarray | Vector of length m containing row scaling factors. |
[in] | Carray | Vector of length n containing column scaling factors. |
[in,out] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
void slate::device::geset | ( | int64_t | m, |
int64_t | n, | ||
scalar_t const & | offdiag_value, | ||
scalar_t const & | diag_value, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
blas::Queue & | queue | ||
) |
Element-wise sets the m-by-n matrix A to diag_value on the diagonal and offdiag_value on the off-diagonals.
[in] | m | Number of rows of A. m >= 0. |
[in] | n | Number of columns of A. n >= 0. |
[in] | offdiag_value | The value to set outside of the diagonal. |
[in] | diag_value | The value to set on the diagonal. |
[out] | A | An m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of A. lda >= m. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::geset_batch_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t | offdiag_value, | ||
scalar_t | diag_value, | ||
scalar_t ** | Aarray, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile set.
__device__ void slate::device::geset_func | ( | int64_t | m, |
int64_t | n, | ||
scalar_t | offdiag_value, | ||
scalar_t | diag_value, | ||
scalar_t * | A, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile set.
Each thread block deals with one tile. Each thread deals with one row. Launched by geset_kernel() and geset_batch_kernel().
Element-wise sets the m-by-n matrix A to diag_value on the diagonal and offdiag_value on the off-diagonals.
[in] | m | Number of rows of A. m >= 0. |
[in] | n | Number of columns of A. n >= 0. |
[in] | offdiag_value | The value to set outside of the diagonal. |
[in] | diag_value | The value to set on the diagonal. |
[out] | A | An m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of A. lda >= m. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::geset_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t | offdiag_value, | ||
scalar_t | diag_value, | ||
scalar_t * | A, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile set.
Element-wise sets the m-by-n matrix A to diag_value on the diagonal and offdiag_value on the off-diagonals.
[in] | m | Number of rows of A. m >= 0. |
[in] | n | Number of columns of A. n >= 0. |
[in] | offdiag_value | The value to set outside of the diagonal. |
[in] | diag_value | The value to set on the diagonal. |
[out] | A | An m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of A. lda >= m. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::henorm | ( | lapack::Norm | norm, |
lapack::Uplo | uplo, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | values, | ||
int64_t | ldv, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine that computes a partial norm for each tile.
Batched routine that returns the largest absolute value of elements for each tile in Aarray.
[in] | norm | Norm to compute. See values for description. |
[in] | uplo | Whether each Aarray[k] is stored in the upper or lower triangle. |
[in] | n | Number of rows and columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of values array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., n-1, j = 0, ..., n-1.
[in] | n | Number of rows and columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of tiles_sums (values) array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | stream | device to execute in. |
__global__ void slate::device::henorm_fro_kernel | ( | lapack::Uplo | uplo, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_values | ||
) |
Sum of squares, in scaled representation, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().
[in] | n | Number of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x). |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | tiles_values | Array of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k). |
__global__ void slate::device::henorm_max_kernel | ( | lapack::Uplo | uplo, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_maxima | ||
) |
Finds the largest absolute value of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * n. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().
[in] | n | Number of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x). |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | tiles_maxima | Array of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k). |
__global__ void slate::device::henorm_one_kernel | ( | lapack::Uplo | uplo, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_sums, | ||
int64_t | ldv | ||
) |
Sum of absolute values of each column of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (n >= 1). Launched by henorm().
[in] | n | Number of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x). |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | tiles_sums | Array of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k). |
[in] | ldv | Leading dimension of tiles_sums (values) array. |
|
inline |
__device__ void slate::device::max_nan_reduce | ( | int | n, |
int | tid, | ||
real_t * | x | ||
) |
Max reduction of n-element array x, leaving total in x[0].
Propagates NaN values consistently. With k threads, can reduce array up to 2*k in size. Assumes number of threads <= 1024, which is the current max number of CUDA threads.
[in] | n | Size of array. |
[in] | tid | Thread id. |
[in] | x | Array of dimension n. On exit, x[0] = max(x[0], ..., x[n-1]); the rest of x is overwritten. |
|
inline |
|
inlineconstexpr |
|
inline |
Square of number.
__device__ void slate::device::sum_reduce | ( | int | n, |
int | tid, | ||
real_t * | x | ||
) |
Sum reduction of n-element array x, leaving total in x[0].
With k threads, can reduce array up to 2*k in size. Assumes number of threads <= 1024 (which is current max number of CUDA threads).
[in] | n | Size of array. |
[in] | tid | Thread id. |
[in] | x | Array of dimension n. On exit, x[0] = sum(x[0], ..., x[n-1]); rest of x is overwritten. |
void slate::device::synorm | ( | lapack::Norm | norm, |
lapack::Uplo | uplo, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | values, | ||
int64_t | ldv, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine that computes a partial norm for each tile.
Batched routine that returns the largest absolute value of elements for each tile in Aarray.
[in] | norm | Norm to compute. See values for description. |
[in] | uplo | Whether each Aarray[k] is stored in the upper or lower triangle. |
[in] | n | Number of rows and columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of values array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., n-1, j = 0, ..., n-1.
[in] | n | Number of rows and columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of tiles_sums (values) array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | stream | device to execute in. |
__global__ void slate::device::synorm_fro_kernel | ( | lapack::Uplo | uplo, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_values | ||
) |
Sum of squares, in scaled representation, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().
[in] | n | Number of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x). |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | tiles_values | Array of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k). |
__global__ void slate::device::synorm_max_kernel | ( | lapack::Uplo | uplo, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_maxima | ||
) |
Finds the largest absolute value of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * n. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().
[in] | n | Number of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x). |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | tiles_maxima | Array of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k). |
__global__ void slate::device::synorm_offdiag_one_kernel | ( | int64_t | m, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_sums, | ||
int64_t | ldv | ||
) |
Sum of absolute values of each row and each column of elements, for each tile in Aarray.
Each thread block deals with one tile. Kernel assumes non-trivial tiles (m, n >= 1). Launched by synormOffdiag().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_sums | Array of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k), and tiles_sums[k*ldv + i + n] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k). |
[in] | ldv | Leading dimension of tiles_sums (values) array. |
__global__ void slate::device::synorm_one_kernel | ( | lapack::Uplo | uplo, |
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_sums, | ||
int64_t | ldv | ||
) |
Sum of absolute values of each column of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (n >= 1). Launched by synorm().
[in] | n | Number of rows and columns of each tile. n >= 1. Also the number of threads per block (blockDim.x). |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | tiles_sums | Array of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k). |
[in] | ldv | Leading dimension of tiles_sums (values) array. |
void slate::device::synormOffdiag | ( | lapack::Norm | norm, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | values, | ||
int64_t | ldv, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine that computes a partial norm for each tile.
Batched routine that returns the largest absolute value of elements for each tile in Aarray.
Used for full, off-diagonal tiles within a symmetric matrix, where element Aij contributes to both column i and j.
[in] | norm | Norm to compute. See values for description. |
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of values array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., n-1, j = 0, ..., n-1.
[in] | n | Number of rows and columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an n-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of tiles_sums (values) array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | stream | GPU device to execute in. |
void slate::device::transpose | ( | bool | is_conj, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t * | dA, | ||
int64_t | lda, | ||
scalar_t * | dAT, | ||
int64_t | ldat, | ||
blas::Queue & | queue | ||
) |
Physically transpose a rectangular matrix out-of-place.
[in] | m | Number of rows of tile. m >= 0. |
[in] | n | Number of columns of tile. n >= 0. |
[in] | dA | A rectangular m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of dA. lda >= m. |
[out] | dAT | A rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, dAT is the transpose of dA. |
[in] | ldat | Leading dimension of dAT. ldat >= n. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::transpose | ( | bool | is_conj, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t * | dA, | ||
int64_t | lda, | ||
scalar_t * | dAT, | ||
int64_t | ldat, | ||
blas::Queue & | queue | ||
) |
Physically transpose a rectangular matrix out-of-place.
[in] | m | Number of rows of tile. m >= 0. |
[in] | n | Number of columns of tile. n >= 0. |
[in] | dA | A rectangular m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of dA. lda >= m. |
[out] | dAT | A rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, dAT is the transpose of dA. |
[in] | ldat | Leading dimension of dAT. ldat >= n. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::transpose | ( | bool | is_conj, |
int64_t | n, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
blas::Queue & | queue | ||
) |
Physically transpose a square matrix in place.
[in] | n | Number of rows and columns of each tile. n >= 0. |
[in,out] | A | A square n-by-n matrix stored in an lda-by-n array in GPU memory. On output, A is transposed. |
[in] | lda | Leading dimension of A. lda >= n. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::transpose_batch | ( | bool | is_conj, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t ** | dA_array, | ||
int64_t | lda, | ||
scalar_t ** | dAT_array, | ||
int64_t | ldat, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Physically transpose a batch of rectangular matrices out-of-place.
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | dA_array | Array in GPU memory of dimension batch_count, containing pointers to matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each dA_array[k] tile. lda >= m. |
[out] | dAT_array | Array in GPU memory of dimension batch_count, containing pointers to matrices, where each dAT_array[k] is a rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, each dAT_array[k] is the transpose of dA_array[k]. |
[in] | ldat | Leading dimension of each dAT_array[k] tile. ldat >= n. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::transpose_batch | ( | bool | is_conj, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t ** | dA_array, | ||
int64_t | lda, | ||
scalar_t ** | dAT_array, | ||
int64_t | ldat, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Physically transpose a batch of rectangular matrices out-of-place.
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | dA_array | Array in GPU memory of dimension batch_count, containing pointers to matrices, where each dA_array[k] is a rectangular m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each dA_array[k] tile. lda >= m. |
[out] | dAT_array | Array in GPU memory of dimension batch_count, containing pointers to matrices, where each dAT_array[k] is a rectangular n-by-m matrix stored in an ldat-by-m array in GPU memory. On output, each dAT_array[k] is the transpose of dA_array[k]. |
[in] | ldat | Leading dimension of each dAT_array[k] tile. ldat >= n. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
void slate::device::transpose_batch | ( | bool | is_conj, |
int64_t | n, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Physically transpose a batch of square matrices in place.
[in] | n | Number of rows and columns of each tile. n >= 0. |
[in,out] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to matrices, where each Aarray[k] is a square n-by-n matrix stored in an lda-by-n array in GPU memory. On output, each Aarray[k] is transposed. |
[in] | lda | Leading dimension of each tile. lda >= n. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
__device__ void slate::device::transpose_func | ( | bool | is_conj, |
int | m, | ||
int | n, | ||
const scalar_t * | A, | ||
int64_t | lda, | ||
scalar_t * | AT, | ||
int64_t | ldat | ||
) |
tile M-by-N matrix with ceil(M/NB) by ceil(N/NB) tiles sized NB-by-NB.
uses NX-by-NY threads, where NB/NX, NB/NY, NX/NY divide evenly.
subtile each NB-by-NB tile with (NB/NX) subtiles sized NX-by-NB
for each subtile
    load NX-by-NB subtile transposed from A into sA, as (NB/NY) blocks sized NX-by-NY
    save NB-by-NX subtile from sA into AT, as (NB/NX)*(NX/NY) blocks sized NX-by-NY
    A += NX
    AT += NX*ldat
e.g., with NB=32, NX=32, NY=8 ([sdc] precisions) load 32x32 subtile as 4 blocks of 32x8 columns: (A11 A12 A13 A14 ) save 32x32 subtile as 1*4 blocks of 32x8 columns: (AT11 AT12 AT13 AT14)
e.g., with NB=32, NX=16, NY=8 (z precision) load 16x32 subtile as 4 blocks of 16x8 columns: (A11 A12 A13 A14) save 32x16 subtile as 2*2 blocks of 16x8 columns: (AT11 AT12) (AT21 AT22)
__device__ void slate::device::transpose_func | ( | bool | is_conj, |
int | n, | ||
scalar_t * | A, | ||
int64_t | lda | ||
) |
Device routine handles one matrix.
Thread block grid: x = batch index (ignored here; see batch kernel), y = block row index, z = block col index. Each thread block is ib-by-ib threads and does one ib-by-ib block of an n-by-n matrix.
Let nt = ceildiv( n, ib ) be the number of blocks for one n-by-n matrix. An even number of blocks uses an (nt + 1) by (nt/2) grid. Example: for nt = 4 blocks, y by z = 5 by 2 grid:
    [ A00  A01 ]
    [----. A11 ]                    [ A10  .  |  .   .  ]
    [ A10 '----]                    [ A20 A21 |  .   .  ]
    [ A20  A21 ]  covers matrix as  [ A30 A31 | A00  .  ]
    [ A30  A31 ]                    [ A40 A41 | A01 A11 ]
    [ A40  A41 ]
An odd number of blocks uses an (nt) by (nt + 1)/2 grid. Example: for nt = 5 blocks, y by z = 5 by 3 grid:
    [ A00 | A01   A02 ]
    [     '----.      ]                    [ A00  .   .  |  .   .  ]
    [ A10   A11 | A12 ]                    [ A10 A11  .  |  .   .  ]
    [           '-----]  covers matrix as  [ A20 A21 A22 |  .   .  ]
    [ A20   A21   A22 ]                    [ A30 A31 A32 | A01  .  ]
    [ A30   A31   A32 ]                    [ A40 A41 A42 | A02 A12 ]
    [ A40   A41   A42 ]
void slate::device::transpose_rect_batch_func | ( | bool | is_conj, |
int | m, | ||
int | n, | ||
scalar_t ** | dAarray, | ||
int64_t | lda, | ||
scalar_t ** | dATarray, | ||
int64_t | ldat, | ||
int | batch_count, | ||
blas::Queue & | queue | ||
) |
Device routine handles batches of rectangular matrices.
The routine loads blocks of data into small NX x NB local storage and then writes the blocks back, transposed, into the correct location.
void slate::device::transpose_rect_func | ( | bool | is_conj, |
int | m, | ||
int | n, | ||
scalar_t * | dA, | ||
int64_t | lda, | ||
scalar_t * | dAT, | ||
int64_t | ldat, | ||
blas::Queue & | queue | ||
) |
Device routine handles a single rectangular matrix.
The routine loads blocks of data into small NX x NB local storage and then writes the blocks back, transposed, into the correct location.
void slate::device::transpose_sqr_batch_func | ( | bool | is_conj, |
int | n, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
int | batch_count, | ||
blas::Queue & | queue | ||
) |
Device routine handles batches of square matrices.
The routine loads blocks of data into small ib x ib local storage and then writes the blocks back, transposed, into the correct location.
void slate::device::transpose_sqr_func | ( | bool | is_conj, |
int | n, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
blas::Queue & | queue | ||
) |
Device routine handles single square matrix.
The routine loads blocks of data into small ib x ib local storage and then writes the blocks back, transposed, into the correct location.
void slate::device::trnorm | ( | lapack::Norm | norm, |
lapack::Uplo | uplo, | ||
lapack::Diag | diag, | ||
int64_t | m, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | values, | ||
int64_t | ldv, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine that computes a partial norm for each trapezoidal tile.
Batched routine that returns the largest absolute value of elements for each tile in Aarray.
todo: rename to tznorm for consistency with other tz routines.
[in] | norm | Norm to compute. See values for description. |
[in] | uplo | Whether each Aarray[k] is upper or lower trapezoidal. |
[in] | diag | Whether or not each Aarray[k] has unit diagonal. |
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of values array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
Sets tiles_maxima[k] = max_{i, j}( abs( A^(k)_(i, j) )), for each tile A^(k), where A^(k) = Aarray[k], k = 0, ..., batch_count-1, i = 0, ..., m-1, j = 0, ..., n-1.
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | values | Array in GPU memory, dimension batch_count * ldv.
|
[in] | ldv | Leading dimension of tiles_sums (values) array. |
[in] | batch_count | Size of Aarray. batch_count >= 0. |
[in] | stream | device to execute in. |
__global__ void slate::device::trnorm_fro_kernel | ( | lapack::Uplo | uplo, |
lapack::Diag | diag, | ||
int64_t | m, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_values | ||
) |
Sum of squares, in scaled representation, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. Also the number of threads per block. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_values | Array of dimension 2 * gridDim.x. On exit, tiles_values[2*k + 0] = scale and tiles_values[2*k + 1] = sumsq, such that scale^2 * sumsq = sum_{i,j} abs( A^(k)_{i,j} )^2 for tile A^(k). |
__global__ void slate::device::trnorm_inf_kernel | ( | lapack::Uplo | uplo, |
lapack::Diag | diag, | ||
int64_t | m, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_sums, | ||
int64_t | ldv | ||
) |
Sum of absolute values of each row of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().
[in] | m | Number of rows of each tile. m >= 1. Also the number of threads per block. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_sums | Array of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + i] = sum_{j} abs( A^(k)_(i, j) ) for row i of tile A^(k). |
[in] | ldv | Leading dimension of tiles_sums (values) array. |
__global__ void slate::device::trnorm_max_kernel | ( | lapack::Uplo | uplo, |
lapack::Diag | diag, | ||
int64_t | m, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_maxima | ||
) |
Finds the largest absolute value of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one row, followed by a reduction. Uses dynamic shared memory array of length sizeof(real_t) * m. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().
[in] | m | Number of rows of each tile. m >= 1. Also the number of threads per block (blockDim.x). |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_maxima | Array of dimension gridDim.x. On exit, tiles_maxima[k] = max_{i, j} abs( A^(k)_(i, j) ) for tile A^(k). |
__global__ void slate::device::trnorm_one_kernel | ( | lapack::Uplo | uplo, |
lapack::Diag | diag, | ||
int64_t | m, | ||
int64_t | n, | ||
scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
blas::real_type< scalar_t > * | tiles_sums, | ||
int64_t | ldv | ||
) |
Sum of absolute values of each column of elements, for each tile in Aarray.
Each thread block deals with one tile. Each thread deals with one column. Kernel assumes non-trivial tiles (m, n >= 1). Launched by trnorm().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. Also the number of threads per block (blockDim.x). |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile. lda >= m. |
[out] | tiles_sums | Array of dimension gridDim.x * ldv. On exit, tiles_sums[k*ldv + j] = sum_{i} abs( A^(k)_(i, j) ) for column j of tile A^(k). |
[in] | ldv | Leading dimension of tiles_sums (values) array. |
void slate::device::tzadd | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t const & | alpha, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
scalar_t const & | beta, | ||
scalar_t ** | Barray, | ||
int64_t | ldb, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine for element-wise trapezoidal tile addition.
Sets upper or lower part of
\[ Barray[k] = \alpha Aarray[k] + \beta Barray[k]. \]
[in] | uplo | Whether each Aarray[k] is upper or lower trapezoidal. |
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | Barray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | batch_count | Size of Aarray and Barray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
Sets upper or lower part of
\[ Barray[k] = \alpha Aarray[k] + \beta Barray[k]. \]
[in] | uplo | Whether each Aarray[k] is upper or lower trapezoidal. |
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | alpha | The scalar alpha. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[in] | beta | The scalar beta. |
[in,out] | Barray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | batch_count | Size of Aarray and Barray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::tzadd_kernel | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t | alpha, | ||
scalar_t ** | Aarray, | ||
int64_t | lda, | ||
scalar_t | beta, | ||
scalar_t ** | Barray, | ||
int64_t | ldb | ||
) |
Kernel implementing element-wise tile addition.
Each thread block deals with one tile. Each thread deals with one row. Launched by tzadd().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
[in,out] | Barray | Array of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. |
[in] | ldb | Leading dimension of each tile in Barray. ldb >= m. |
void slate::device::tzcopy | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
src_scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
dst_scalar_t ** | Barray, | ||
int64_t | ldb, | ||
int64_t | batch_count, | ||
blas::Queue & | queue | ||
) |
Batched routine for element-wise trapezoidal copy and precision conversion, copying A to B.
Batched routine for element-wise copy and precision conversion.
Sets upper or lower part of
\[ Barray[k] = Aarray[k]. \]
[in] | uplo | Whether each Aarray[k] is upper or lower trapezoidal. |
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[out] | Barray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | batch_count | Size of Aarray and Barray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
[in] | m | Number of rows of each tile. m >= 0. |
[in] | n | Number of columns of each tile. n >= 0. |
[in] | Aarray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of each tile in A. lda >= m. |
[out] | Barray | Array in GPU memory of dimension batch_count, containing pointers to tiles, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array in GPU memory. |
[in] | ldb | Leading dimension of each tile in B. ldb >= m. |
[in] | batch_count | Size of Aarray and Barray. batch_count >= 0. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::tzcopy_kernel | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
src_scalar_t const *const * | Aarray, | ||
int64_t | lda, | ||
dst_scalar_t ** | Barray, | ||
int64_t | ldb | ||
) |
Kernel implementing copy and precision conversions, copying A to B.
Each thread block deals with one tile. Each thread deals with one row. Launched by tzcopy().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
[out] | Barray | Array of tiles of dimension gridDim.x, where each Barray[k] is an m-by-n matrix stored in an ldb-by-n array. |
[in] | ldb | Leading dimension of each tile in Barray. ldb >= m. |
__global__ void slate::device::tzscale_kernel | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
blas::real_type< scalar_t > | numer, | ||
blas::real_type< scalar_t > | denom, | ||
scalar_t ** | Aarray, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile scale.
Each thread block deals with one tile. Each thread deals with one row. Launched by tzscale().
[in] | m | Number of rows of each tile. m >= 1. |
[in] | n | Number of columns of each tile. n >= 1. |
[in] | numer | Scale value numerator. |
[in] | denom | Scale value denominator. |
[in,out] | Aarray | Array of tiles of dimension gridDim.x, where each Aarray[k] is an m-by-n matrix stored in an lda-by-n array. |
[in] | lda | Leading dimension of each tile in Aarray. lda >= m. |
void slate::device::tzset | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t const & | offdiag_value, | ||
scalar_t const & | diag_value, | ||
scalar_t * | A, | ||
int64_t | lda, | ||
blas::Queue & | queue | ||
) |
Element-wise trapezoidal tile set.
Sets upper or lower part of A to diag_value on the diagonal and offdiag_value on the off-diagonals.
[in] | uplo | Whether A is upper or lower trapezoidal. |
[in] | m | Number of rows of A. m >= 0. |
[in] | n | Number of columns of A. n >= 0. |
[in] | offdiag_value | Constant to set offdiagonal entries to. |
[in] | diag_value | Constant to set diagonal entries to. |
[out] | A | An m-by-n matrix stored in an lda-by-n array in GPU memory. |
[in] | lda | Leading dimension of A. lda >= m. |
[in] | queue | BLAS++ queue to execute in. |
__global__ void slate::device::tzset_batch_kernel | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t | offdiag_value, | ||
scalar_t | diag_value, | ||
scalar_t ** | Aarray, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile set.
__device__ void slate::device::tzset_func | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t | offdiag_value, | ||
scalar_t | diag_value, | ||
scalar_t * | A, | ||
int64_t | lda | ||
) |
Device function implementing element-wise tile set.
Each thread block deals with one tile. gridDim.x == batch_count. Each thread deals with one row. Called by tzset_kernel and tzset_batch_kernel.
__global__ void slate::device::tzset_kernel | ( | lapack::Uplo | uplo, |
int64_t | m, | ||
int64_t | n, | ||
scalar_t | offdiag_value, | ||
scalar_t | diag_value, | ||
scalar_t * | A, | ||
int64_t | lda | ||
) |
Kernel implementing element-wise tile set.
static const int slate::device::ib = 32 |
Block size for genorm_one_kernel.
Internal blocking: a 16 x 16 thread block = 256 threads; a 32 x 32 thread block = 1024 threads.