// Source listing: blaspp/gemv_8hh_source.html

// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.

// SPDX-License-Identifier: BSD-3-Clause

// This program is free software: you can redistribute it and/or modify it under

// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.


#ifndef BLAS_GEMV_HH

#define BLAS_GEMV_HH


#include "blas/util.hh"


#include <limits>


namespace blas {


// =============================================================================


/// General matrix-vector multiply (generic templated implementation):
///
///     y = alpha*op(A)*x + beta*y,
///
/// where op(A) is A, A^T, or A^H as selected by `trans`, A is an m-by-n
/// matrix, and x and y are strided vectors.
///
/// alpha, beta, and all intermediate products and sums use
/// blas::scalar_type<TA, TX, TY>.
///
/// Returns immediately, leaving y untouched, when m == 0, n == 0, or
/// (alpha == 0 and beta == 1).
///
/// Invalid arguments (see the per-parameter constraints) are reported
/// through blas_error_if.
///
/// @param[in] layout
///     Storage order of A: Layout::ColMajor or Layout::RowMajor.
///
/// @param[in] trans
///     Operation applied to A:
///     Op::NoTrans (op(A) = A), Op::Trans (A^T), or Op::ConjTrans (A^H).
///
/// @param[in] m
///     Number of rows of A. m >= 0.
///
/// @param[in] n
///     Number of columns of A. n >= 0.
///
/// @param[in] alpha
///     Scalar multiplying op(A)*x. If zero, only the beta scaling of y
///     is performed and A and x are not accessed.
///
/// @param[in] A
///     The m-by-n matrix, stored with leading dimension lda.
///
/// @param[in] lda
///     Leading dimension of A.
///     lda >= m for Layout::ColMajor, lda >= n for Layout::RowMajor.
///
/// @param[in] x
///     Vector of n elements if trans == Op::NoTrans, m elements otherwise.
///
/// @param[in] incx
///     Stride between elements of x; must be nonzero.
///     A negative stride walks x backwards, starting at offset
///     (1 - len)*incx.
///
/// @param[in] beta
///     Scalar multiplying y. If zero, y is assigned zero rather than
///     multiplied, so its previous contents do not influence the result.
///
/// @param[in,out] y
///     Vector of m elements if trans == Op::NoTrans, n elements otherwise.
///
/// @param[in] incy
///     Stride between elements of y; must be nonzero.
///     A negative stride walks y backwards, starting at offset
///     (1 - len)*incy.
template <typename TA, typename TX, typename TY>
void gemv(
    blas::Layout layout,
    blas::Op trans,
    int64_t m, int64_t n,
    blas::scalar_type<TA, TX, TY> alpha,
    TA const *A, int64_t lda,
    TX const *x, int64_t incx,
    blas::scalar_type<TA, TX, TY> beta,
    TY *y, int64_t incy )
{
    using scalar_t = blas::scalar_type<TA, TX, TY>;

    const scalar_t zero = 0;
    const scalar_t one  = 1;

    // ----------
    // Argument validation. The condition text is left exactly as written,
    // since blas_error_if may embed it in the diagnostic it produces.
    blas_error_if( layout != Layout::ColMajor &&
                   layout != Layout::RowMajor );
    blas_error_if( trans != Op::NoTrans &&
                   trans != Op::Trans &&
                   trans != Op::ConjTrans );
    blas_error_if( m < 0 );
    blas_error_if( n < 0 );

    if (layout == Layout::ColMajor) {
        blas_error_if( lda < m );
    }
    else {
        blas_error_if( lda < n );
    }

    blas_error_if( incx == 0 );
    blas_error_if( incy == 0 );

    // ----------
    // Nothing to do for an empty matrix or for the identity update.
    if (m == 0 || n == 0)
        return;
    if (alpha == zero && beta == one)
        return;

    // ----------
    // Reduce row-major to column-major: a row-major m-by-n A has the same
    // memory image as the column-major n-by-m matrix A^T. Hence
    //     A   x  becomes a Trans   product,
    //     A^T x  becomes a NoTrans product,
    //     A^H x  becomes a NoTrans product with conjugated entries.
    bool conj_A = false;
    if (layout == Layout::RowMajor) {
        std::swap( m, n );
        conj_A = (trans == Op::ConjTrans);
        trans  = (trans == Op::NoTrans ? Op::Trans : Op::NoTrans);
    }

    // Vector lengths follow the (possibly remapped) operation.
    const bool    no_trans = (trans == Op::NoTrans);
    const int64_t x_len    = (no_trans ? n : m);
    const int64_t y_len    = (no_trans ? m : n);

    // With a negative stride the first logical element sits at the far
    // end of the array, so begin there and step towards index 0.
    const int64_t x_first = (incx > 0 ? 0 : (-x_len + 1)*incx);
    const int64_t y_first = (incy > 0 ? 0 : (-y_len + 1)*incy);

    // ----------
    // y = beta*y; beta == 0 assigns instead of multiplying.
    if (beta != one) {
        if (beta == zero) {
            if (incy == 1) {
                for (int64_t i = 0; i < y_len; ++i)
                    y[i] = zero;
            }
            else {
                for (int64_t i = 0, iy = y_first; i < y_len; ++i, iy += incy)
                    y[iy] = zero;
            }
        }
        else {
            if (incy == 1) {
                for (int64_t i = 0; i < y_len; ++i)
                    y[i] *= beta;
            }
            else {
                for (int64_t i = 0, iy = y_first; i < y_len; ++i, iy += incy)
                    y[iy] *= beta;
            }
        }
    }

    // With alpha == 0 the product term vanishes; A and x are never read.
    if (alpha == zero)
        return;

    // ----------
    // Accumulate alpha*op(A)*x into y. A is traversed one column at a
    // time; entry (i, j) is located at A[ i + j*lda ].
    if (no_trans) {
        if (! conj_A) {
            // y += alpha * A * x
            if (incy == 1) {
                for (int64_t j = 0, jx = x_first; j < n; ++j, jx += incx) {
                    const scalar_t scaled_xj = alpha*x[jx];
                    TA const* col = A + j*lda;
                    for (int64_t i = 0; i < m; ++i)
                        y[i] += scaled_xj * col[i];
                }
            }
            else {
                for (int64_t j = 0, jx = x_first; j < n; ++j, jx += incx) {
                    const scalar_t scaled_xj = alpha*x[jx];
                    TA const* col = A + j*lda;
                    for (int64_t i = 0, iy = y_first; i < m; ++i, iy += incy)
                        y[iy] += scaled_xj * col[i];
                }
            }
        }
        else {
            // y += alpha * conj( A ) * x
            // Only reached for a row-major A^H * x request.
            if (incy == 1) {
                for (int64_t j = 0, jx = x_first; j < n; ++j, jx += incx) {
                    const scalar_t scaled_xj = alpha*x[jx];
                    TA const* col = A + j*lda;
                    for (int64_t i = 0; i < m; ++i)
                        y[i] += scaled_xj * conj(col[i]);
                }
            }
            else {
                for (int64_t j = 0, jx = x_first; j < n; ++j, jx += incx) {
                    const scalar_t scaled_xj = alpha*x[jx];
                    TA const* col = A + j*lda;
                    for (int64_t i = 0, iy = y_first; i < m; ++i, iy += incy)
                        y[iy] += scaled_xj * conj(col[i]);
                }
            }
        }
    }
    else if (trans == Op::Trans) {
        // y += alpha * A^T * x: each y entry is a column-of-A dot x.
        if (incx == 1) {
            for (int64_t j = 0, jy = y_first; j < n; ++j, jy += incy) {
                TA const* col = A + j*lda;
                scalar_t dot = zero;
                for (int64_t i = 0; i < m; ++i)
                    dot += col[i] * x[i];
                y[jy] += alpha*dot;
            }
        }
        else {
            for (int64_t j = 0, jy = y_first; j < n; ++j, jy += incy) {
                TA const* col = A + j*lda;
                scalar_t dot = zero;
                for (int64_t i = 0, ix = x_first; i < m; ++i, ix += incx)
                    dot += col[i] * x[ix];
                y[jy] += alpha*dot;
            }
        }
    }
    else {
        // y += alpha * A^H * x: as above, with conjugated column entries.
        if (incx == 1) {
            for (int64_t j = 0, jy = y_first; j < n; ++j, jy += incy) {
                TA const* col = A + j*lda;
                scalar_t dot = zero;
                for (int64_t i = 0; i < m; ++i)
                    dot += conj(col[i]) * x[i];
                y[jy] += alpha*dot;
            }
        }
        else {
            for (int64_t j = 0, jy = y_first; j < n; ++j, jy += incy) {
                TA const* col = A + j*lda;
                scalar_t dot = zero;
                for (int64_t i = 0, ix = x_first; i < m; ++i, ix += incx)
                    dot += conj(col[i]) * x[ix];
                y[jy] += alpha*dot;
            }
        }
    }
}


}  // namespace blas


#endif        //  #ifndef BLAS_GEMV_HH

// Doxygen cross-references for this listing:
//
//   blas::gemv
//     void gemv(blas::Layout layout, blas::Op trans, int64_t m, int64_t n, blas::scalar_type< TA, TX, TY > alpha, TA const *A, int64_t lda, TX const *x, int64_t incx, blas::scalar_type< TA, TX, TY > beta, TY *y, int64_t incy)
//     General matrix-vector multiply.
//     Definition: gemv.hh:79
//
//   blas::swap
//     void swap(int64_t n, float *x, int64_t incx, float *y, int64_t incy, blas::Queue &queue)
//     GPU device, float version.
//     Definition: device_swap.cc:67