77template <
typename TA,
typename TB,
typename TC>
83 scalar_type<TA, TB, TC> alpha,
84 TA
const *A, int64_t lda,
85 TB
const *B, int64_t ldb,
86 scalar_type<TA, TB, TC> beta,
90 using scalar_t = blas::scalar_type<TA, TB, TC>;  // common scalar type of A, B and C; must include TC to match the type of alpha/beta and avoid narrowing the accumulators
92 #define A(i_, j_) A[ (i_) + (j_)*lda ]
93 #define B(i_, j_) B[ (i_) + (j_)*ldb ]
94 #define C(i_, j_) C[ (i_) + (j_)*ldc ]
97 const scalar_t zero = 0;
98 const scalar_t one = 1;
101 blas_error_if( layout != Layout::ColMajor &&
102 layout != Layout::RowMajor );
103 blas_error_if( side != Side::Left &&
104 side != Side::Right );
105 blas_error_if( uplo != Uplo::Lower &&
106 uplo != Uplo::Upper &&
107 uplo != Uplo::General );
108 blas_error_if( m < 0 );
109 blas_error_if( n < 0 );
112 if (layout == Layout::RowMajor) {
113 side = (side == Side::Left)
116 if (uplo == Uplo::Lower)
118 else if (uplo == Uplo::Upper)
124 blas_error_if( lda < ((side == Side::Left) ? m : n) );
125 blas_error_if( ldb < m );
126 blas_error_if( ldc < m );
129 if (m == 0 || n == 0)
135 for (int64_t j = 0; j < n; ++j) {
136 for (int64_t i = 0; i < m; ++i)
140 else if (beta != one) {
141 for (int64_t j = 0; j < n; ++j) {
142 for (int64_t i = 0; i < m; ++i)
150 if (side == Side::Left) {
151 if (uplo != Uplo::Lower) {
153 for (int64_t j = 0; j < n; ++j) {
154 for (int64_t i = 0; i < m; ++i) {
156 scalar_t alpha_Bij = alpha*B(i, j);
159 for (int64_t k = 0; k < i; ++k) {
160 C(k, j) += A(k, i) * alpha_Bij;
161 sum += A(k, i) * B(k, j);
165 + A(i, i) * alpha_Bij
172 for (int64_t j = 0; j < n; ++j) {
173 for (int64_t i = m-1; i >= 0; --i) {
175 scalar_t alpha_Bij = alpha*B(i, j);
178 for (int64_t k = i+1; k < m; ++k) {
179 C(k, j) += A(k, i) * alpha_Bij;
180 sum += A(k, i) * B(k, j);
184 + A(i, i) * alpha_Bij
191 if (uplo != Uplo::Lower) {
193 for (int64_t j = 0; j < n; ++j) {
195 scalar_t alpha_Akj = alpha * A(j, j);
197 for (int64_t i = 0; i < m; ++i)
198 C(i, j) = beta * C(i, j) + B(i, j) * alpha_Akj;
200 for (int64_t k = 0; k < j; ++k) {
201 alpha_Akj = alpha*A(k, j);
202 for (int64_t i = 0; i < m; ++i)
203 C(i, j) += B(i, k) * alpha_Akj;
206 for (int64_t k = j+1; k < n; ++k) {
207 alpha_Akj = alpha * A(j, k);
208 for (int64_t i = 0; i < m; ++i)
209 C(i, j) += B(i, k) * alpha_Akj;
215 for (int64_t j = 0; j < n; ++j) {
217 scalar_t alpha_Akj = alpha * A(j, j);
219 for (int64_t i = 0; i < m; ++i)
220 C(i, j) = beta * C(i, j) + B(i, j) * alpha_Akj;
222 for (int64_t k = 0; k < j; ++k) {
223 alpha_Akj = alpha * A(j, k);
224 for (int64_t i = 0; i < m; ++i)
225 C(i, j) += B(i, k) * alpha_Akj;
228 for (int64_t k = j+1; k < n; ++k) {
229 alpha_Akj = alpha*A(k, j);
230 for (int64_t i = 0; i < m; ++i)
231 C(i, j) += B(i, k) * alpha_Akj;
void swap(int64_t n, float *x, int64_t incx, float *y, int64_t incy, blas::Queue &queue)
GPU device, float version.
Definition device_swap.cc:67
void symm(blas::Layout layout, blas::Side side, blas::Uplo uplo, int64_t m, int64_t n, float alpha, float const *A, int64_t lda, float const *B, int64_t ldb, float beta, float *C, int64_t ldc, blas::Queue &queue)
GPU device, float version.
Definition device_symm.cc:106