// core/stdarch/crates/core_arch/src/nvptx/packed.rs

//! NVPTX Packed data types (SIMD)
//!
//! Packed Data Types is what PTX calls SIMD types. See [PTX ISA (Packed Data Types)](https://docs.nvidia.com/cuda/parallel-thread-execution/#packed-data-types) for a full reference.

// Note: #[assert_instr] tests are not actually being run on nvptx due to being a `no_std` target incapable of running tests. Something like FileCheck would be appropriate for verifying the correct instruction is used.
use crate::intrinsics::simd::*;

// Declarations of the LLVM `<2 x half>` min/max intrinsics that the wrappers
// below forward to. `improper_ctypes` is allowed because `f16x2` is a SIMD
// vector type, which Rust's FFI lint rejects even though it is exactly the
// type these LLVM intrinsic signatures take.
#[allow(improper_ctypes)]
extern "C" {
    // Lane-wise minimum; NaN behavior follows LLVM LangRef `llvm.minnum`.
    #[link_name = "llvm.minnum.v2f16"]
    fn llvm_f16x2_minnum(a: f16x2, b: f16x2) -> f16x2;
    // Lane-wise minimum; NaN behavior follows LLVM LangRef `llvm.minimum`
    // (NaN propagates).
    #[link_name = "llvm.minimum.v2f16"]
    fn llvm_f16x2_minimum(a: f16x2, b: f16x2) -> f16x2;
    // Lane-wise maximum; NaN behavior follows LLVM LangRef `llvm.maxnum`.
    #[link_name = "llvm.maxnum.v2f16"]
    fn llvm_f16x2_maxnum(a: f16x2, b: f16x2) -> f16x2;
    // Lane-wise maximum; NaN behavior follows LLVM LangRef `llvm.maximum`
    // (NaN propagates).
    #[link_name = "llvm.maximum.v2f16"]
    fn llvm_f16x2_maximum(a: f16x2, b: f16x2) -> f16x2;
}
20
// Vector type declaration via the stdarch `types!` macro.
types! {
    #![unstable(feature = "stdarch_nvptx", issue = "111199")]

    /// PTX-specific 32-bit wide floating point (f16 x 2) vector type
    pub struct f16x2(2 x f16);

}
28
29/// Add two values, round to nearest even
30///
31/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add>
32///
33/// Corresponds to the CUDA C intrinsics:
34///  - [`__hadd2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g921c795176eaa31265bd80ef4fe4b8e6)
35///  - [`__hadd2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g6cd8ddb2c3d670e1a10c3eb2e7644f82)
36#[inline]
37#[cfg_attr(test, assert_instr(add.rn.f16x22))]
38#[unstable(feature = "stdarch_nvptx", issue = "111199")]
39pub unsafe fn f16x2_add(a: f16x2, b: f16x2) -> f16x2 {
40    simd_add(a, b)
41}
42
43/// Subtract two values, round to nearest even
44///
45/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-sub>
46///
47/// Corresponds to the CUDA C intrinsics:
48///  - [`__hsub2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1ga5536c9c3d853d8c8b9de60e18b41e54)
49///  - [`__hsub2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g8adc164c68d553354f749f0f0645a874)
50#[inline]
51#[cfg_attr(test, assert_instr(sub.rn.f16x2))]
52#[unstable(feature = "stdarch_nvptx", issue = "111199")]
53pub unsafe fn f16x2_sub(a: f16x2, b: f16x2) -> f16x2 {
54    simd_sub(a, b)
55}
56
57/// Multiply two values, round to nearest even
58///
59/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-mul>
60///
61/// Corresponds to the CUDA C intrinsics:
62///  - [`__hmul2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g70de3f2ee48babe4e0969397ac17708e)
63///  - [`__hmul2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g99f8fe23a4b4c6898d6faf999afaa76e)
64#[inline]
65#[cfg_attr(test, assert_instr(mul.rn.f16x2))]
66#[unstable(feature = "stdarch_nvptx", issue = "111199")]
67pub unsafe fn f16x2_mul(a: f16x2, b: f16x2) -> f16x2 {
68    simd_mul(a, b)
69}
70
71/// Fused multiply-add, round to nearest even
72///
73/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-fma>
74///
75/// Corresponds to the CUDA C intrinsics:
76///  - [`__fma2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab)
77///  - [`__fma2_rn`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__ARITHMETIC.html#group__CUDA__MATH____HALF2__ARITHMETIC_1g43628ba21ded8b1e188a367348008dab)
78#[inline]
79#[cfg_attr(test, assert_instr(fma.rn.f16x2))]
80#[unstable(feature = "stdarch_nvptx", issue = "111199")]
81pub unsafe fn f16x2_fma(a: f16x2, b: f16x2, c: f16x2) -> f16x2 {
82    simd_fma(a, b, c)
83}
84
85/// Arithmetic negate
86///
87/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-neg>
88///
89/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6)
90#[inline]
91#[cfg_attr(test, assert_instr(neg.f16x2))]
92#[unstable(feature = "stdarch_nvptx", issue = "111199")]
93pub unsafe fn f16x2_neg(a: f16x2) -> f16x2 {
94    simd_neg(a)
95}
96
97/// Find the minimum of two values
98///
99/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-min>
100///
101/// Corresponds to the CUDA C intrinsic [`__hmin2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g9e17a33f96061804166f3fbd395422b6)
102#[inline]
103#[cfg_attr(test, assert_instr(min.f16x2))]
104#[unstable(feature = "stdarch_nvptx", issue = "111199")]
105pub unsafe fn f16x2_min(a: f16x2, b: f16x2) -> f16x2 {
106    llvm_f16x2_minnum(a, b)
107}
108
109/// Find the minimum of two values, NaNs pass through.
110///
111/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-min>
112///
113/// Corresponds to the CUDA C intrinsic [`__hmin2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g8bb8f58e9294cc261d2f42c4d5aecd6b)
114#[inline]
115#[cfg_attr(test, assert_instr(min.NaN.f16x2))]
116#[unstable(feature = "stdarch_nvptx", issue = "111199")]
117pub unsafe fn f16x2_min_nan(a: f16x2, b: f16x2) -> f16x2 {
118    llvm_f16x2_minimum(a, b)
119}
120
121/// Find the maximum of two values
122///
123/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-max>
124///
125/// Corresponds to the CUDA C intrinsic [`__hmax2`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g59fc7fc7975d8127b202444a05e57e3d)
126#[inline]
127#[cfg_attr(test, assert_instr(max.f16x2))]
128#[unstable(feature = "stdarch_nvptx", issue = "111199")]
129pub unsafe fn f16x2_max(a: f16x2, b: f16x2) -> f16x2 {
130    llvm_f16x2_maxnum(a, b)
131}
132
133/// Find the maximum of two values, NaNs pass through.
134///
135/// <https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-max>
136///
137/// Corresponds to the CUDA C intrinsic [`__hmax2_nan`](https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH____HALF2__COMPARISON.html#group__CUDA__MATH____HALF2__COMPARISON_1g41623db7850e3074fd9daa80a14c3897)
138#[inline]
139#[cfg_attr(test, assert_instr(max.NaN.f16x2))]
140#[unstable(feature = "stdarch_nvptx", issue = "111199")]
141pub unsafe fn f16x2_max_nan(a: f16x2, b: f16x2) -> f16x2 {
142    llvm_f16x2_maximum(a, b)
143}