78template <
typename TA,
typename TX,
typename TY>
83 blas::scalar_type<TA, TX, TY> alpha,
84 TA
const *A, int64_t lda,
85 TX
const *x, int64_t incx,
86 blas::scalar_type<TA, TX, TY> beta,
90 using scalar_t = blas::scalar_type<TA, TX, TY>;
92 #define A(i_, j_) A[ (i_) + (j_)*lda ]
95 const scalar_t zero = 0;
96 const scalar_t one = 1;
99 blas_error_if( layout != Layout::ColMajor &&
100 layout != Layout::RowMajor );
101 blas_error_if( trans != Op::NoTrans &&
102 trans != Op::Trans &&
103 trans != Op::ConjTrans );
104 blas_error_if( m < 0 );
105 blas_error_if( n < 0 );
107 if (layout == Layout::ColMajor)
108 blas_error_if( lda < m );
110 blas_error_if( lda < n );
112 blas_error_if( incx == 0 );
113 blas_error_if( incy == 0 );
116 if (m == 0 || n == 0 || (alpha == zero && beta == one))
120 if (layout == Layout::RowMajor) {
123 if (trans == Op::NoTrans) {
127 if (trans == Op::ConjTrans) {
134 int64_t lenx = (trans == Op::NoTrans ? n : m);
135 int64_t leny = (trans == Op::NoTrans ? m : n);
136 int64_t kx = (incx > 0 ? 0 : (-lenx + 1)*incx);
137 int64_t ky = (incy > 0 ? 0 : (-leny + 1)*incy);
144 for (int64_t i = 0; i < leny; ++i) {
149 for (int64_t i = 0; i < leny; ++i) {
157 for (int64_t i = 0; i < leny; ++i) {
163 for (int64_t i = 0; i < leny; ++i) {
174 if (trans == Op::NoTrans && ! doconj) {
178 for (int64_t j = 0; j < n; ++j) {
179 scalar_t tmp = alpha*x[jx];
181 for (int64_t i = 0; i < m; ++i) {
182 y[i] += tmp * A(i, j);
187 for (int64_t j = 0; j < n; ++j) {
188 scalar_t tmp = alpha*x[jx];
191 for (int64_t i = 0; i < m; ++i) {
192 y[iy] += tmp * A(i, j);
198 else if (trans == Op::NoTrans && doconj) {
203 for (int64_t j = 0; j < n; ++j) {
204 scalar_t tmp = alpha*x[jx];
206 for (int64_t i = 0; i < m; ++i) {
207 y[i] += tmp * conj(A(i, j));
212 for (int64_t j = 0; j < n; ++j) {
213 scalar_t tmp = alpha*x[jx];
216 for (int64_t i = 0; i < m; ++i) {
217 y[iy] += tmp * conj(A(i, j));
223 else if (trans == Op::Trans) {
227 for (int64_t j = 0; j < n; ++j) {
229 for (int64_t i = 0; i < m; ++i) {
230 tmp += A(i, j) * x[i];
237 for (int64_t j = 0; j < n; ++j) {
240 for (int64_t i = 0; i < m; ++i) {
241 tmp += A(i, j) * x[ix];
253 for (int64_t j = 0; j < n; ++j) {
255 for (int64_t i = 0; i < m; ++i) {
256 tmp += conj(A(i, j)) * x[i];
263 for (int64_t j = 0; j < n; ++j) {
266 for (int64_t i = 0; i < m; ++i) {
267 tmp += conj(A(i, j)) * x[ix];
void gemv(blas::Layout layout, blas::Op trans, int64_t m, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TA const *A, int64_t lda, TX const *x, int64_t incx, blas::scalar_type< TA, TX, TY > beta, TY *y, int64_t incy)
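General matrix-vector multiply: $y = \alpha \, op(A) \, x + \beta y$, where $op(A)$ is $A$, $A^T$, or $A^H$ as selected by `trans`.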
Definition gemv.hh:79
void swap(int64_t n, float *x, int64_t incx, float *y, int64_t incy, blas::Queue &queue)
GPU device, float version.
Definition device_swap.cc:67