BLAS++ 2024.05.31
BLAS C++ API
device.hh
// Copyright (c) 2017-2023, University of Tennessee. All rights reserved.
// SPDX-License-Identifier: BSD-3-Clause
// This program is free software: you can redistribute it and/or modify it under
// the terms of the BSD 3-Clause license. See the accompanying LICENSE file.

#ifndef BLAS_DEVICE_HH
#define BLAS_DEVICE_HH

#include "blas/util.hh"
#include "blas/defines.h"

#if defined( BLAS_HAVE_CUBLAS ) \
    || defined( BLAS_HAVE_ROCBLAS ) \
    || defined( BLAS_HAVE_SYCL )
    #define BLAS_HAVE_DEVICE
#endif

#ifdef BLAS_HAVE_CUBLAS
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

#elif defined(BLAS_HAVE_ROCBLAS)
    // Default to the HCC platform on ROCm.
    #if ! defined(__HIP_PLATFORM_NVCC__) && ! defined(__HIP_PLATFORM_HCC__)
        #define __HIP_PLATFORM_HCC__
        #define BLAS_HIP_PLATFORM_HCC
    #endif

    #include <hip/hip_runtime.h>

    // Headers moved in ROCm 5.2.
    #if HIP_VERSION >= 50200000
        #include <rocblas/rocblas.h>
    #else
        #include <rocblas.h>
    #endif

    // If we defined __HIP_PLATFORM_HCC__ above, undefine it again.
    #ifdef BLAS_HIP_PLATFORM_HCC
        #undef __HIP_PLATFORM_HCC__
        #undef BLAS_HIP_PLATFORM_HCC
    #endif

#elif defined(BLAS_HAVE_SYCL)
    #include <sycl/detail/cl.h>  // For CL version
    #include <sycl.hpp>

#endif
namespace blas {

// -----------------------------------------------------------------------------
// types

#ifdef BLAS_HAVE_CUBLAS
    typedef int device_blas_int;
#elif defined(BLAS_HAVE_ROCBLAS)
    typedef int device_blas_int;
#elif defined(BLAS_HAVE_SYCL)
    typedef std::int64_t device_blas_int;
#else
    typedef int device_blas_int;
#endif

// -----------------------------------------------------------------------------
/// Direction of a memory copy, analogous to cudaMemcpyKind / hipMemcpyKind.
enum class MemcpyKind : device_blas_int {
    HostToHost     = 0,
    HostToDevice   = 1,
    DeviceToHost   = 2,
    DeviceToDevice = 3,
    Default        = 4,
};

// -----------------------------------------------------------------------------
#if defined(BLAS_HAVE_CUBLAS)
    // Converts MemcpyKind to the corresponding cudaMemcpyKind constant.
    inline cudaMemcpyKind memcpy2cuda( MemcpyKind kind )
    {
        switch (kind) {
            case MemcpyKind::HostToHost:     return cudaMemcpyHostToHost;
            case MemcpyKind::HostToDevice:   return cudaMemcpyHostToDevice;
            case MemcpyKind::DeviceToHost:   return cudaMemcpyDeviceToHost;
            case MemcpyKind::DeviceToDevice: return cudaMemcpyDeviceToDevice;
            case MemcpyKind::Default:        return cudaMemcpyDefault;
            default: throw blas::Error( "unknown memcpy direction" );
        }
    }
#elif defined(BLAS_HAVE_ROCBLAS)
    // Converts MemcpyKind to the corresponding hipMemcpyKind constant.
    inline hipMemcpyKind memcpy2hip( MemcpyKind kind )
    {
        switch (kind) {
            case MemcpyKind::HostToHost:     return hipMemcpyHostToHost;
            case MemcpyKind::HostToDevice:   return hipMemcpyHostToDevice;
            case MemcpyKind::DeviceToHost:   return hipMemcpyDeviceToHost;
            case MemcpyKind::DeviceToDevice: return hipMemcpyDeviceToDevice;
            case MemcpyKind::Default:        return hipMemcpyDefault;
            default: throw blas::Error( "unknown memcpy direction" );
        }
    }
#elif defined(BLAS_HAVE_SYCL)
    // SYCL memcpy infers the direction from its pointers, so kind is unused.
    inline int64_t memcpy2sycl( MemcpyKind kind ) { return 0; }
#endif

// -----------------------------------------------------------------------------
// constants
const int MaxBatchChunk = 50000;

#if defined( BLAS_HAVE_CUBLAS ) || defined( BLAS_HAVE_ROCBLAS )
    const int MaxForkSize = 10;
#else
    // SYCL and CPU-only builds don't support fork mode.
    const int MaxForkSize = 1;
#endif

//==============================================================================
/// Queue for executing GPU device routines.
class Queue
{
public:
    // Define generic names for vendor types.
    #if defined( BLAS_HAVE_CUBLAS )
        using stream_t = cudaStream_t;
        using event_t  = cudaEvent_t;
        using handle_t = cublasHandle_t;

    #elif defined( BLAS_HAVE_ROCBLAS )
        using stream_t = hipStream_t;
        using event_t  = hipEvent_t;
        using handle_t = rocblas_handle;

    #elif defined( BLAS_HAVE_SYCL )
        using stream_t = sycl::queue;

    #else
        // No GPU code.
        using stream_t = void*;  // unused
    #endif

    Queue();
    Queue( int device );

    Queue( int device, stream_t& stream );

    #if defined( BLAS_HAVE_CUBLAS ) || defined( BLAS_HAVE_ROCBLAS )
        Queue( int device, handle_t handle );
    #endif

    // Disable copying; must construct anew.
    Queue( Queue const& ) = delete;
    Queue& operator=( Queue const& ) = delete;

    ~Queue();

    int device() const { return device_; }

    // Synchronize with the queue: wait for its enqueued work to finish.
    void sync();

    // Pointer to the queue's device workspace.
    void* work() { return (void*) work_; }

    // Workspace size, in elements of scalar_t.
    template <typename scalar_t>
    size_t work_size() const { return lwork_ / sizeof(scalar_t); }

    template <typename scalar_t>
    void work_ensure_size( size_t lwork );

    // Fork kernel launches on this queue from the default stream
    // to parallel streams.
    void fork( int num_streams=MaxForkSize );

    // Switch back to the default stream.
    void join();

    // Switch execution to the next-in-line stream
    // (valid in both default and fork modes).
    void revolve();

    #if defined( BLAS_HAVE_CUBLAS ) || defined( BLAS_HAVE_ROCBLAS )
        // Common for CUDA, ROCm.
        void set_handle( handle_t& in_handle );
        handle_t handle() const { return handle_; }
    #endif

    // Common for all: CUDA, ROCm, SYCL, no GPU.
    void set_stream( stream_t& in_stream );

    stream_t& stream()
    {
        #if defined( BLAS_HAVE_CUBLAS ) || defined( BLAS_HAVE_ROCBLAS )
            return streams_[ current_stream_index_ ];
        #else
            return streams_[ 0 ];
        #endif
    }

private:
    // Workspace for pointer arrays of batch routines or other purposes.
    char* work_;
    size_t lwork_;

    // streams_[ 0 ] is the default stream; the rest are parallel streams
    // used in fork mode.
    stream_t streams_[ MaxForkSize ];

    #if defined( BLAS_HAVE_CUBLAS ) || defined( BLAS_HAVE_ROCBLAS )
        // Associated device BLAS handle.
        handle_t handle_;

        event_t events_[ MaxForkSize ];

        // The number of streams the queue is currently using for
        // launching kernels (1 by default).
        int num_active_streams_;

        // Index of the current stream in use.
        int current_stream_index_;

        // Whether the queue owns the BLAS handle and default stream,
        // or the user provided them.
        bool own_handle_;
        bool own_default_stream_;
    #endif

    // Associated device ID.
    int device_;
};

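// Example (documentation sketch, not part of the header): a typical Queue
// life cycle with fork-join. The loop body is a placeholder for independent
// device BLAS calls, and the function name is hypothetical. Assumes a
// GPU-enabled build with at least one device; parallel streams require
// CUDA or ROCm (MaxForkSize > 1).
inline void example_queue_fork_join()
{
    blas::Queue queue( 0 );  // execution queue on device 0

    queue.fork();  // switch to parallel streams, up to MaxForkSize
    for (int i = 0; i < 4; ++i) {
        // ... enqueue the i-th independent BLAS call on queue ...
        queue.revolve();  // rotate to the next-in-line stream
    }
    queue.join();  // switch back to the default stream
    queue.sync();  // wait for all enqueued work to finish
}
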
// -----------------------------------------------------------------------------
// Light wrappers around CUDA and cuBLAS functions.
#ifdef BLAS_HAVE_CUBLAS

inline bool is_device_error( cudaError_t error )
{
    return (error != cudaSuccess);
}

inline bool is_device_error( cublasStatus_t error )
{
    return (error != CUBLAS_STATUS_SUCCESS);
}

inline const char* device_error_string( cudaError_t error )
{
    return cudaGetErrorString( error );
}

// see device_error.cc
const char* device_error_string( cublasStatus_t error );

#endif  // BLAS_HAVE_CUBLAS

// -----------------------------------------------------------------------------
// Light wrappers around HIP and rocBLAS functions.
#ifdef BLAS_HAVE_ROCBLAS

inline bool is_device_error( hipError_t error )
{
    return (error != hipSuccess);
}

inline bool is_device_error( rocblas_status error )
{
    return (error != rocblas_status_success);
}

inline const char* device_error_string( hipError_t error )
{
    return hipGetErrorString( error );
}

inline const char* device_error_string( rocblas_status error )
{
    return rocblas_status_to_string( error );
}

#endif  // BLAS_HAVE_ROCBLAS

// -----------------------------------------------------------------------------
// device errors
#if defined(BLAS_ERROR_NDEBUG) || (defined(BLAS_ERROR_ASSERT) && defined(NDEBUG))

    // BLAS++ does no error checking on device errors.
    #define blas_dev_call( error ) \
        error

#elif defined(BLAS_ERROR_ASSERT)

    // BLAS++ aborts on device errors.
    #if defined(BLAS_HAVE_SYCL)
        #define blas_dev_call( error ) \
            do { \
                try { \
                    error; \
                } \
                catch (sycl::exception const& e) { \
                    blas::internal::abort_if( true, __func__, \
                                              "%s", e.what() ); \
                } \
                catch (std::exception const& e) { \
                    blas::internal::abort_if( true, __func__, \
                                              "%s", e.what() ); \
                } \
                catch (...) { \
                    blas::internal::abort_if( true, __func__, \
                                              "%s", "unknown exception" ); \
                } \
            } while(0)

    #else
        #define blas_dev_call( error ) \
            do { \
                auto e = error; \
                blas::internal::abort_if( blas::is_device_error(e), __func__, \
                                          "%s", blas::device_error_string(e) ); \
            } while(0)
    #endif

#else

    // BLAS++ throws device errors (default).
    #if defined(BLAS_HAVE_SYCL)
        #define blas_dev_call( error ) \
            do { \
                try { \
                    error; \
                } \
                catch (sycl::exception const& e) { \
                    blas::internal::throw_if( true, \
                                              e.what(), __func__ ); \
                } \
                catch (std::exception const& e) { \
                    blas::internal::throw_if( true, \
                                              e.what(), __func__ ); \
                } \
                catch (...) { \
                    blas::internal::throw_if( true, \
                                              "unknown exception", __func__ ); \
                } \
            } while(0)

    #else
        #define blas_dev_call( error ) \
            do { \
                auto e = error; \
                blas::internal::throw_if( blas::is_device_error(e), \
                                          blas::device_error_string(e), \
                                          __func__ ); \
            } while(0)
    #endif

#endif

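// Example (documentation sketch, not part of the header): wrapping a raw
// runtime call so that failures follow the error policy selected above
// (throw blas::Error by default). The function name is hypothetical;
// guarded for a CUDA build.
#ifdef BLAS_HAVE_CUBLAS
inline void example_blas_dev_call()
{
    void* ptr = nullptr;
    blas_dev_call(
        cudaMalloc( &ptr, 256 ) );  // throws blas::Error on failure
    blas_dev_call(
        cudaFree( ptr ) );
}
#endif
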
// -----------------------------------------------------------------------------
// set/get device functions

// private, internal routine; sets the device for CUDA and ROCm; no-op for SYCL
void internal_set_device( int device );

int get_device_count();

// -----------------------------------------------------------------------------
// memory functions

// Frees device memory allocated by device_malloc.
void device_free( void* ptr, blas::Queue &queue );

// Frees pinned host memory allocated by host_malloc_pinned.
void host_free_pinned( void* ptr, blas::Queue &queue );

// -----------------------------------------------------------------------------
// Template functions declared here
// -----------------------------------------------------------------------------

//------------------------------------------------------------------------------
/// Allocates nelements of type T on queue's device;
/// throws blas::Error if device BLAS is not available.
template <typename T>
T* device_malloc(
    int64_t nelements, blas::Queue &queue )
{
    blas_error_if( nelements < 0 );

    T* ptr = nullptr;
    #ifdef BLAS_HAVE_CUBLAS
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            cudaMalloc( (void**)&ptr, nelements * sizeof(T) ) );

    #elif defined(BLAS_HAVE_ROCBLAS)
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            hipMalloc( (void**)&ptr, nelements * sizeof(T) ) );

    #elif defined(BLAS_HAVE_SYCL)
        blas_dev_call(
            ptr = (T*)sycl::malloc_shared( nelements*sizeof(T), queue.stream() ) );

    #else
        throw blas::Error( "device BLAS not available", __func__ );
    #endif
    return ptr;
}

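// Example (documentation sketch, not part of the header): allocating device
// memory tied to a queue's device, then freeing it. The function name and
// size are placeholders; assumes a GPU-enabled build, since device_malloc
// throws blas::Error in a CPU-only build.
inline void example_device_malloc( blas::Queue& queue )
{
    int64_t n = 1000;
    double* dx = blas::device_malloc<double>( n, queue );
    // ... pass dx to device BLAS routines on this queue ...
    blas::device_free( dx, queue );
}
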
//------------------------------------------------------------------------------
/// Allocates nelements of type T in pinned (page-locked) host memory;
/// throws blas::Error if device BLAS is not available.
template <typename T>
T* host_malloc_pinned(
    int64_t nelements, blas::Queue &queue )
{
    blas_error_if( nelements < 0 );

    T* ptr = nullptr;
    #ifdef BLAS_HAVE_CUBLAS
        blas_dev_call(
            cudaMallocHost( (void**)&ptr, nelements * sizeof(T) ) );

    #elif defined(BLAS_HAVE_ROCBLAS)
        blas_dev_call(
            hipHostMalloc( (void**)&ptr, nelements * sizeof(T) ) );

    #elif defined(BLAS_HAVE_SYCL)
        blas_dev_call(
            ptr = (T*)sycl::malloc_host( nelements*sizeof(T), queue.stream() ) );

    #else
        throw blas::Error( "device BLAS not available", __func__ );
    #endif
    return ptr;
}

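// Example (documentation sketch, not part of the header): pinned host memory
// behaves like ordinary host memory but lets the asynchronous copies defined
// below overlap with compute. The function name and size are placeholders;
// assumes a GPU-enabled build.
inline void example_host_malloc_pinned( blas::Queue& queue )
{
    int64_t n = 1000;
    float* hx = blas::host_malloc_pinned<float>( n, queue );
    for (int64_t i = 0; i < n; ++i)
        hx[ i ] = 0.0f;  // host code can use hx like ordinary memory
    blas::host_free_pinned( hx, queue );
}
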
//------------------------------------------------------------------------------
/// Sets each of the nelements*sizeof(T) bytes starting at ptr to value,
/// asynchronously on queue's stream.
template <typename T>
void device_memset(
    T* ptr,
    int value, int64_t nelements, Queue& queue)
{
    blas_error_if( nelements < 0 );

    #ifdef BLAS_HAVE_CUBLAS
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            cudaMemsetAsync(
                ptr, value,
                nelements * sizeof(T), queue.stream() ) );

    #elif defined(BLAS_HAVE_ROCBLAS)
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            hipMemsetAsync(
                ptr, value,
                nelements * sizeof(T), queue.stream() ) );

    #elif defined(BLAS_HAVE_SYCL)
        blas_dev_call(
            queue.stream().memset( ptr, value, nelements * sizeof(T) ) );

    #else
        throw blas::Error( "device BLAS not available", __func__ );
    #endif
}

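// Example (documentation sketch, not part of the header): zeroing a device
// array before accumulating into it. The function name and size are
// placeholders; assumes a GPU-enabled build. Note that, as with std::memset,
// value is applied to every byte.
inline void example_device_memset( blas::Queue& queue )
{
    int64_t n = 1000;
    double* dx = blas::device_malloc<double>( n, queue );
    blas::device_memset( dx, 0, n, queue );  // all n doubles become 0.0
    queue.sync();
    blas::device_free( dx, queue );
}
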
//------------------------------------------------------------------------------
template <typename T>
[[deprecated("Use device_memcpy without kind. To be removed 2025-05.")]]
void device_memcpy(
    T* dst,
    T const* src,
    int64_t nelements, MemcpyKind kind, Queue& queue)
{
    blas_error_if( nelements < 0 );

    #ifdef BLAS_HAVE_CUBLAS
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            cudaMemcpyAsync(
                dst, src, sizeof(T)*nelements,
                memcpy2cuda(kind), queue.stream() ) );

    #elif defined(BLAS_HAVE_ROCBLAS)
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            hipMemcpyAsync(
                dst, src, sizeof(T)*nelements,
                memcpy2hip(kind), queue.stream() ) );

    #elif defined(BLAS_HAVE_SYCL)
        blas_dev_call(
            queue.stream().memcpy( dst, src, sizeof(T)*nelements ) );

    #else
        throw blas::Error( "device BLAS not available", __func__ );
    #endif
}

//------------------------------------------------------------------------------
/// Copies nelements of type T from src to dst, asynchronously on queue's
/// stream; the direction is inferred from the pointers.
template <typename T>
void device_memcpy(
    T* dst,
    T const* src,
    int64_t nelements, Queue& queue)
{
    blas_error_if( nelements < 0 );

    #ifdef BLAS_HAVE_CUBLAS
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            cudaMemcpyAsync(
                dst, src, sizeof(T)*nelements,
                cudaMemcpyDefault, queue.stream() ) );

    #elif defined(BLAS_HAVE_ROCBLAS)
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            hipMemcpyAsync(
                dst, src, sizeof(T)*nelements,
                hipMemcpyDefault, queue.stream() ) );

    #elif defined(BLAS_HAVE_SYCL)
        blas_dev_call(
            queue.stream().memcpy( dst, src, sizeof(T)*nelements ) );

    #else
        throw blas::Error( "device BLAS not available", __func__ );
    #endif
}

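// Example (documentation sketch, not part of the header): a round-trip copy.
// Because the direction is inferred from the pointers (cudaMemcpyDefault /
// hipMemcpyDefault), the same call handles host-to-device and device-to-host.
// The function name and size are placeholders; assumes a GPU-enabled build.
inline void example_device_memcpy( blas::Queue& queue )
{
    int64_t n = 500;
    double* hx = blas::host_malloc_pinned<double>( n, queue );
    double* dx = blas::device_malloc<double>( n, queue );
    for (int64_t i = 0; i < n; ++i)
        hx[ i ] = 1.0;
    blas::device_memcpy( dx, hx, n, queue );  // host -> device
    blas::device_memcpy( hx, dx, n, queue );  // device -> host
    queue.sync();  // copies are asynchronous; wait before reading hx
    blas::device_free( dx, queue );
    blas::host_free_pinned( hx, queue );
}
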
//------------------------------------------------------------------------------
template <typename T>
[[deprecated("Use device_memcpy_2d without kind. To be removed 2025-05.")]]
void device_memcpy_2d(
    T* dst, int64_t dst_pitch,
    T const* src, int64_t src_pitch,
    int64_t width, int64_t height, MemcpyKind kind, Queue& queue)
{
    blas_error_if( width < 0 );
    blas_error_if( height < 0 );
    blas_error_if( dst_pitch < width );
    blas_error_if( src_pitch < width );

    #ifdef BLAS_HAVE_CUBLAS
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            cudaMemcpy2DAsync(
                dst, sizeof(T)*dst_pitch,
                src, sizeof(T)*src_pitch,
                sizeof(T)*width, height, memcpy2cuda(kind), queue.stream() ) );

    #elif defined(BLAS_HAVE_ROCBLAS)
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            hipMemcpy2DAsync(
                dst, sizeof(T)*dst_pitch,
                src, sizeof(T)*src_pitch,
                sizeof(T)*width, height, memcpy2hip(kind), queue.stream() ) );

    #elif defined(BLAS_HAVE_SYCL)
        if (dst_pitch == width && src_pitch == width) {
            // one contiguous memcpy
            blas_dev_call(
                queue.stream().memcpy( dst, src, width * height * sizeof(T) ) );
        }
        else {
            // Copy each contiguous image row (matrix column).
            // SYCL does not support set/get/lacpy matrix.
            for (int64_t i = 0; i < height; ++i) {
                T* dst_row = dst + i*dst_pitch;
                T const* src_row = src + i*src_pitch;
                blas_dev_call(
                    queue.stream().memcpy( dst_row, src_row, width*sizeof(T) ) );
            }
        }
    #else
        throw blas::Error( "device BLAS not available", __func__ );
    #endif
}

//------------------------------------------------------------------------------
/// Copies a 2-D array: height contiguous runs of width elements each, with
/// runs spaced src_pitch and dst_pitch elements apart. For an m-by-n
/// column-major matrix, width = m, height = n, pitch = ld.
/// The direction is inferred from the pointers.
template <typename T>
void device_memcpy_2d(
    T* dst, int64_t dst_pitch,
    T const* src, int64_t src_pitch,
    int64_t width, int64_t height, Queue& queue)
{
    blas_error_if( width < 0 );
    blas_error_if( height < 0 );
    blas_error_if( dst_pitch < width );
    blas_error_if( src_pitch < width );

    #ifdef BLAS_HAVE_CUBLAS
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            cudaMemcpy2DAsync(
                dst, sizeof(T)*dst_pitch,
                src, sizeof(T)*src_pitch,
                sizeof(T)*width, height,
                cudaMemcpyDefault, queue.stream() ) );

    #elif defined(BLAS_HAVE_ROCBLAS)
        blas::internal_set_device( queue.device() );
        blas_dev_call(
            hipMemcpy2DAsync(
                dst, sizeof(T)*dst_pitch,
                src, sizeof(T)*src_pitch,
                sizeof(T)*width, height,
                hipMemcpyDefault, queue.stream() ) );

    #elif defined(BLAS_HAVE_SYCL)
        if (dst_pitch == width && src_pitch == width) {
            // one contiguous memcpy
            blas_dev_call(
                queue.stream().memcpy( dst, src, width * height * sizeof(T) ) );
        }
        else {
            // Copy each contiguous image row (matrix column).
            // SYCL does not support set/get/lacpy matrix.
            for (int64_t i = 0; i < height; ++i) {
                T* dst_row = dst + i*dst_pitch;
                T const* src_row = src + i*src_pitch;
                blas_dev_call(
                    queue.stream().memcpy( dst_row, src_row, width*sizeof(T) ) );
            }
        }
    #else
        throw blas::Error( "device BLAS not available", __func__ );
    #endif
}

//------------------------------------------------------------------------------
/// Copies a strided vector: dst[ i*inc_dst ] = src[ i*inc_src ], i = 0, ..., n-1.
template <typename T>
void device_copy_vector(
    int64_t n,
    T const* src, int64_t inc_src,
    T* dst, int64_t inc_dst, Queue& queue)
{
    if (inc_src == 1 && inc_dst == 1) {
        // Copy contiguous vector.
        device_memcpy( dst, src, n, queue );
    }
    else {
        // Interpret as copying one row from an inc-by-n matrix.
        device_memcpy_2d( dst, inc_dst, src, inc_src, 1, n, queue );
    }
}

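// Example (documentation sketch, not part of the header): gathering every
// other element of a strided device vector, BLAS-style (n, x, incx). The
// function name and sizes are placeholders; assumes a GPU-enabled build.
inline void example_device_copy_vector( blas::Queue& queue )
{
    int64_t n = 100;
    double* dx = blas::device_malloc<double>( 2*n, queue );
    double* dy = blas::device_malloc<double>( n, queue );
    blas::device_copy_vector( n, dx, 2, dy, 1, queue );  // dy[i] = dx[2*i]
    queue.sync();
    blas::device_free( dx, queue );
    blas::device_free( dy, queue );
}
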
//------------------------------------------------------------------------------
/// Copies an m-by-n column-major matrix from src to dst,
/// with leading dimensions ld_src and ld_dst.
template <typename T>
void device_copy_matrix(
    int64_t m, int64_t n,
    T const* src, int64_t ld_src,
    T* dst, int64_t ld_dst, Queue& queue)
{
    device_memcpy_2d( dst, ld_dst, src, ld_src, m, n, queue );
}

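// Example (documentation sketch, not part of the header): copying an m-by-n
// column-major matrix between device buffers with different leading
// dimensions. The function name and sizes are placeholders; assumes a
// GPU-enabled build.
inline void example_device_copy_matrix( blas::Queue& queue )
{
    int64_t m = 100, n = 50, lda = 128, ldb = 100;
    double* dA = blas::device_malloc<double>( lda*n, queue );
    double* dB = blas::device_malloc<double>( ldb*n, queue );
    // Each column is m elements; columns are spaced ld elements apart.
    blas::device_copy_matrix( m, n, dA, lda, dB, ldb, queue );
    queue.sync();
    blas::device_free( dA, queue );
    blas::device_free( dB, queue );
}
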
//------------------------------------------------------------------------------
/// Ensures the GPU device workspace is at least lwork elements of scalar_t,
/// synchronizing and reallocating if needed.
template <typename scalar_t>
void Queue::work_ensure_size( size_t lwork )
{
    lwork *= sizeof(scalar_t);
    if (lwork > lwork_) {
        sync();
        if (work_) {
            device_free( work_, *this );
        }
        lwork_ = max( lwork, 3*MaxBatchChunk*sizeof(void*) );
        work_  = device_malloc<char>( lwork_, *this );
    }
}

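// Example (documentation sketch, not part of the header): reserving queue
// workspace, e.g., for the device pointer arrays of a batched routine. The
// function name and batch size are placeholders; assumes a GPU-enabled build.
inline void example_work_ensure_size( blas::Queue& queue )
{
    size_t batch = 1000;
    queue.work_ensure_size<void*>( 3*batch );  // room for 3 pointer arrays
    void** dA_array = (void**) queue.work();
    // ... fill dA_array with batch device pointers; launch batched kernel ...
    (void) dA_array;
}
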
} // namespace blas

#endif // #ifndef BLAS_DEVICE_HH