core/stdarch/crates/core_arch/src/x86/avx.rs

//! Advanced Vector Extensions (AVX)
//!
//! The references are:
//!
//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
//!   Instruction Set Reference, A-Z][intel64_ref].
//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
//!   System Instructions][amd64_ref].
//!
//! [Wikipedia][wiki] provides a quick overview of the instructions available.
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
//! [wiki]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions

use crate::{
    core_arch::{simd::*, x86::*},
    intrinsics::simd::*,
    mem, ptr,
};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Adds packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_add(a, b)
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 {
    simd_add(a, b)
}

/// Computes the bitwise AND of packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_pd)
#[inline]
#[target_feature(enable = "avx")]
// See https://github.com/rust-lang/stdarch/issues/71
#[cfg_attr(test, assert_instr(vandp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise AND of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(a, b))
}

/// Computes the bitwise OR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_pd)
#[inline]
#[target_feature(enable = "avx")]
// See <https://github.com/rust-lang/stdarch/issues/71>.
#[cfg_attr(test, assert_instr(vorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_or(a, b))
}

/// Computes the bitwise OR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_or(a, b))
}

/// Shuffles double-precision (64-bit) floating-point elements within 128-bit
/// lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_pd<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(MASK, 8);
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1) + 4,
            ((MASK as u32 >> 2) & 0b1) + 2,
            ((MASK as u32 >> 3) & 0b1) + 6,
        ],
    )
}

/// Shuffles single-precision (32-bit) floating-point elements in `a` within
/// 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_shuffle_ps<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(MASK, 8);
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 8,
            ((MASK as u32 >> 6) & 0b11) + 8,
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 12,
            ((MASK as u32 >> 6) & 0b11) + 12,
        ],
    )
}
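// Illustrative usage sketch (not part of the original source; the values are
// hypothetical and an AVX-capable CPU is assumed). For `_mm256_shuffle_pd`,
// even output slots always come from `a` and odd slots from `b`; bit `i` of
// `MASK` picks the lower or upper double of the corresponding 128-bit lane:
//
//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
//     let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
//     // MASK = 0b0101 selects [a[1], b[0], a[3], b[2]] = [2.0, 5.0, 4.0, 7.0]
//     let r = _mm256_shuffle_pd::<0b0101>(a, b);
//
// `_mm256_shuffle_ps` works the same way, with two mask bits per output element.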

/// Computes the bitwise NOT of packed double-precision (64-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_and(simd_xor(u64x4::splat(!(0_u64)), a), b))
}

/// Computes the bitwise NOT of packed single-precision (32-bit) floating-point
/// elements in `a`, and then AND with `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vandnps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_and(simd_xor(u32x8::splat(!(0_u32)), a), b))
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d {
    vmaxpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed maximum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmaxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 {
    vmaxps(a, b)
}

/// Compares packed double-precision (64-bit) floating-point elements
/// in `a` and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d {
    vminpd(a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a`
/// and `b`, and returns packed minimum values
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vminps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 {
    vminps(a, b)
}

/// Multiplies packed double-precision (64-bit) floating-point elements
/// in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_mul(a, b)
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vmulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 {
    simd_mul(a, b)
}

/// Alternatively adds and subtracts packed double-precision (64-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d {
    let a = a.as_f64x4();
    let b = b.as_f64x4();
    let add = simd_add(a, b);
    let sub = simd_sub(a, b);
    simd_shuffle!(add, sub, [4, 1, 6, 3])
}

/// Alternatively adds and subtracts packed single-precision (32-bit)
/// floating-point elements in `a` to/from packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_addsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vaddsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 {
    let a = a.as_f32x8();
    let b = b.as_f32x8();
    let add = simd_add(a, b);
    let sub = simd_sub(a, b);
    simd_shuffle!(add, sub, [8, 1, 10, 3, 12, 5, 14, 7])
}
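// Illustrative sketch (not from the original file; assumes AVX is available):
// `_mm256_addsub_pd` subtracts in the even slots and adds in the odd slots.
//
//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
//     let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
//     // [1.0 - 10.0, 2.0 + 20.0, 3.0 - 30.0, 4.0 + 40.0] = [-9.0, 22.0, -27.0, 44.0]
//     let r = _mm256_addsub_pd(a, b);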

/// Subtracts packed double-precision (64-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_sub(a, b)
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b`
/// from packed elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 {
    simd_sub(a, b)
}

/// Computes the division of each of the 8 packed 32-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 {
    simd_div(a, b)
}

/// Computes the division of each of the 4 packed 64-bit floating-point elements
/// in `a` by the corresponding packed elements in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_div_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdivpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d {
    simd_div(a, b)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd, ROUNDING = 0x3))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_pd<const ROUNDING: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundpd256(a, ROUNDING)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d {
    simd_ceil(a)
}

/// Rounds packed double-precision (64-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d {
    simd_floor(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// according to the flag `ROUNDING`. The value of `ROUNDING` may be as follows:
///
/// - `0x00`: Round to the nearest whole number.
/// - `0x01`: Round down, toward negative infinity.
/// - `0x02`: Round up, toward positive infinity.
/// - `0x03`: Truncate the values.
///
/// For a complete list of options, check [the LLVM docs][llvm_docs].
///
/// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_round_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps, ROUNDING = 0x00))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_round_ps<const ROUNDING: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundps256(a, ROUNDING)
}
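// Illustrative sketch (not from the original file; assumes AVX is available).
// The low two bits of `ROUNDING` pick one of the modes described above; the
// `_MM_FROUND_*` constants spell the same encodings.
//
//     let a = _mm256_setr_ps(1.7, 2.2, -1.7, -2.2, 0.5, 1.5, 2.5, 3.5);
//     // 0x03: truncate -> [1.0, 2.0, -1.0, -2.0, 0.0, 1.0, 2.0, 3.0]
//     let t = _mm256_round_ps::<0x03>(a);
//     // 0x01: round down -> [1.0, 2.0, -2.0, -3.0, 0.0, 1.0, 2.0, 3.0]
//     let d = _mm256_round_ps::<0x01>(a);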

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward positive infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_ceil_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 {
    simd_ceil(a)
}

/// Rounds packed single-precision (32-bit) floating point elements in `a`
/// toward negative infinity.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_floor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vroundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 {
    simd_floor(a)
}

/// Returns the square root of packed single-precision (32-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 {
    simd_fsqrt(a)
}

/// Returns the square root of packed double-precision (64-bit) floating point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sqrt_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d {
    simd_fsqrt(a)
}

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_pd)
#[inline]
#[target_feature(enable = "avx")]
// Note: LLVM7 prefers single-precision blend instructions when
// possible, see: https://bugs.llvm.org/show_bug.cgi?id=38194
// #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))]
#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_pd<const IMM4: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    simd_shuffle!(
        a,
        b,
        [
            ((IMM4 as u32 >> 0) & 1) * 4 + 0,
            ((IMM4 as u32 >> 1) & 1) * 4 + 1,
            ((IMM4 as u32 >> 2) & 1) * 4 + 2,
            ((IMM4 as u32 >> 3) & 1) * 4 + 3,
        ],
    )
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using control mask `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blend_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        b,
        [
            ((IMM8 as u32 >> 0) & 1) * 8 + 0,
            ((IMM8 as u32 >> 1) & 1) * 8 + 1,
            ((IMM8 as u32 >> 2) & 1) * 8 + 2,
            ((IMM8 as u32 >> 3) & 1) * 8 + 3,
            ((IMM8 as u32 >> 4) & 1) * 8 + 4,
            ((IMM8 as u32 >> 5) & 1) * 8 + 5,
            ((IMM8 as u32 >> 6) & 1) * 8 + 6,
            ((IMM8 as u32 >> 7) & 1) * 8 + 7,
        ],
    )
}
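// Illustrative sketch (not from the original file; assumes AVX is available):
// bit `i` of the blend mask selects element `i` from `b` when set, otherwise
// from `a`.
//
//     let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
//     let b = _mm256_setr_ps(8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0);
//     // 0b1111_0000: low four from `a`, high four from `b`
//     // -> [0.0, 1.0, 2.0, 3.0, 12.0, 13.0, 14.0, 15.0]
//     let r = _mm256_blend_ps::<0b1111_0000>(a, b);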

/// Blends packed double-precision (64-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    let mask: i64x4 = simd_lt(transmute::<_, i64x4>(c), i64x4::ZERO);
    transmute(simd_select(mask, b.as_f64x4(), a.as_f64x4()))
}

/// Blends packed single-precision (32-bit) floating-point elements from
/// `a` and `b` using `c` as a mask.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vblendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 {
    let mask: i32x8 = simd_lt(transmute::<_, i32x8>(c), i32x8::ZERO);
    transmute(simd_select(mask, b.as_f32x8(), a.as_f32x8()))
}
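// Illustrative sketch (not from the original file; assumes AVX is available):
// only the sign bit of each mask element matters, so any negative value
// (including -0.0) selects the element from `b`.
//
//     let a = _mm256_set1_ps(1.0);
//     let b = _mm256_set1_ps(2.0);
//     let m = _mm256_setr_ps(-1.0, 1.0, -3.0, 3.0, -0.0, 0.0, -5.0, 5.0);
//     // -> [2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0]
//     let r = _mm256_blendv_ps(a, b, m);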

/// Conditionally multiplies the packed single-precision (32-bit) floating-point
/// elements in `a` and `b` using the high 4 bits in `imm8`,
/// sums the four products, and conditionally returns the sum
/// using the low 4 bits of `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vdpps, IMM8 = 0x0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_dp_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    vdpps(a, b, IMM8)
}
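// Illustrative sketch (not from the original file; assumes AVX is available).
// The dot product is computed independently in each 128-bit lane: the high
// four bits of `IMM8` choose which products enter the sum, the low four bits
// choose which output slots receive it (the rest are zeroed).
//
//     let a = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 1.0, 1.0, 1.0, 1.0);
//     let b = a;
//     // IMM8 = 0xFF: use all four products, broadcast the sum to every slot
//     // -> [30.0, 30.0, 30.0, 30.0, 4.0, 4.0, 4.0, 4.0]
//     let r = _mm256_dp_ps::<0xFF>(a, b);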

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in even locations,
/// while sums of elements from `b` are returned in odd locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d {
    vhaddpd(a, b)
}

/// Horizontal addition of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, sums of elements from `a` are returned in locations of
/// indices 0, 1, 4, 5; while sums of elements from `b` are in locations
/// 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhaddps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 {
    vhaddps(a, b)
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 4 64-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in even
/// locations, while differences of elements from `b` are returned in odd
/// locations.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d {
    vhsubpd(a, b)
}

/// Horizontal subtraction of adjacent pairs in the two packed vectors
/// of 8 32-bit floating points `a` and `b`.
/// In the result, differences of elements from `a` are returned in locations
/// of indices 0, 1, 4, 5; while differences of elements from `b` are in
/// locations 2, 3, 6, 7.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vhsubps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 {
    vhsubps(a, b)
}
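// Illustrative sketch (not from the original file; assumes AVX is available)
// of the pairwise layout shared by `_mm256_hadd_pd` and `_mm256_hsub_pd`:
//
//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
//     let b = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
//     // hadd -> [1.0 + 2.0, 10.0 + 20.0, 3.0 + 4.0, 30.0 + 40.0] = [3.0, 30.0, 7.0, 70.0]
//     let s = _mm256_hadd_pd(a, b);
//     // hsub -> [1.0 - 2.0, 10.0 - 20.0, 3.0 - 4.0, 30.0 - 40.0] = [-1.0, -10.0, -1.0, -10.0]
//     let d = _mm256_hsub_pd(a, b);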

/// Computes the bitwise XOR of packed double-precision (64-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorp))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d {
    let a: u64x4 = transmute(a);
    let b: u64x4 = transmute(b);
    transmute(simd_xor(a, b))
}

/// Computes the bitwise XOR of packed single-precision (32-bit) floating-point
/// elements in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vxorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 {
    let a: u32x8 = transmute(a);
    let b: u32x8 = transmute(b);
    transmute(simd_xor(a, b))
}

/// Equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OQ: i32 = 0x00;
/// Less-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OS: i32 = 0x01;
/// Less-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OS: i32 = 0x02;
/// Unordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_Q: i32 = 0x03;
/// Not-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_UQ: i32 = 0x04;
/// Not-less-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_US: i32 = 0x05;
/// Not-less-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_US: i32 = 0x06;
/// Ordered (non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_Q: i32 = 0x07;
/// Equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_UQ: i32 = 0x08;
/// Not-greater-than-or-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_US: i32 = 0x09;
/// Not-greater-than (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_US: i32 = 0x0a;
/// False (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OQ: i32 = 0x0b;
/// Not-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OQ: i32 = 0x0c;
/// Greater-than-or-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OS: i32 = 0x0d;
/// Greater-than (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OS: i32 = 0x0e;
/// True (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_UQ: i32 = 0x0f;
/// Equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_OS: i32 = 0x10;
/// Less-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LT_OQ: i32 = 0x11;
/// Less-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_LE_OQ: i32 = 0x12;
/// Unordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_UNORD_S: i32 = 0x13;
/// Not-equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_US: i32 = 0x14;
/// Not-less-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLT_UQ: i32 = 0x15;
/// Not-less-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NLE_UQ: i32 = 0x16;
/// Ordered (signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_ORD_S: i32 = 0x17;
/// Equal (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_EQ_US: i32 = 0x18;
/// Not-greater-than-or-equal (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGE_UQ: i32 = 0x19;
/// Not-greater-than (unordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NGT_UQ: i32 = 0x1a;
/// False (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_FALSE_OS: i32 = 0x1b;
/// Not-equal (ordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_NEQ_OS: i32 = 0x1c;
/// Greater-than-or-equal (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GE_OQ: i32 = 0x1d;
/// Greater-than (ordered, non-signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_GT_OQ: i32 = 0x1e;
/// True (unordered, signaling)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _CMP_TRUE_US: i32 = 0x1f;

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_pd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmppd(a, b, const { IMM5 as i8 })
}

/// Compares packed double-precision (64-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqpd, IMM5 = 0))] // TODO Validate vcmppd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_pd<const IMM5: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmppd256(a, b, IMM5 as u8)
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ps<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpps(a, b, const { IMM5 as i8 })
}

/// Compares packed single-precision (32-bit) floating-point
/// elements in `a` and `b` based on the comparison operand
/// specified by `IMM5`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmp_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqps, IMM5 = 0))] // TODO Validate vcmpps
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cmp_ps<const IMM5: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpps256(a, b, const { IMM5 as u8 })
}
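// Illustrative sketch (not from the original file; assumes AVX is available):
// a comparison produces an all-ones or all-zeros pattern per element, which
// can then be turned into a bitmask with `_mm256_movemask_ps`.
//
//     let a = _mm256_setr_ps(1.0, 5.0, 2.0, 6.0, 3.0, 7.0, 0.0, 8.0);
//     let b = _mm256_set1_ps(4.0);
//     let lt = _mm256_cmp_ps::<_CMP_LT_OQ>(a, b);
//     // elements 0, 2, 4, 6 compare true -> 0b0101_0101
//     let bits = _mm256_movemask_ps(lt);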

/// Compares the lower double-precision (64-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper element from `a` to the upper element of the returned
/// vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqsd, IMM5 = 0))] // TODO Validate vcmpsd
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_sd<const IMM5: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpsd(a, b, IMM5 as i8)
}

/// Compares the lower single-precision (32-bit) floating-point element in
/// `a` and `b` based on the comparison operand specified by `IMM5`,
/// stores the result in the lower element of the returned vector,
/// and copies the upper 3 packed elements from `a` to the upper elements of
/// the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmp_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcmpeqss, IMM5 = 0))] // TODO Validate vcmpss
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmp_ss<const IMM5: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM5, 5);
    vcmpss(a, b, IMM5 as i8)
}

/// Converts packed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d {
    simd_cast(a.as_i32x4())
}

/// Converts packed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 {
    simd_cast(a.as_i32x8())
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 {
    simd_cast(a)
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i {
    transmute(vcvtps2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed double-precision (64-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtps_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d {
    simd_cast(a)
}

/// Returns the first element of the input vector of `[4 x double]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsd_f64)
#[inline]
#[target_feature(enable = "avx")]
//#[cfg_attr(test, assert_instr(movsd))] FIXME
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 {
    simd_extract!(a, 0)
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvttpd2dq(a))
}

/// Converts packed double-precision (64-bit) floating-point elements in `a`
/// to packed 32-bit integers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtpd_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i {
    transmute(vcvtpd2dq(a))
}

/// Converts packed single-precision (32-bit) floating-point elements in `a`
/// to packed 32-bit integers with truncation.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvttps_epi32)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i {
    transmute(vcvttps2dq(a))
}
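// Illustrative sketch (not from the original file; assumes AVX is available
// and the default MXCSR rounding mode, round-to-nearest-even):
//
//     let a = _mm256_setr_ps(1.5, 2.5, -1.5, 1.9, -1.9, 0.4, -0.4, 3.0);
//     // rounds per MXCSR -> [2, 2, -2, 2, -2, 0, 0, 3]
//     let rounded = _mm256_cvtps_epi32(a);
//     // truncates toward zero -> [1, 2, -1, 1, -1, 0, 0, 3]
//     let truncated = _mm256_cvttps_epi32(a);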

/// Extracts 128 bits (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_ps<const IMM1: i32>(a: __m256) -> __m128 {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(
        a,
        _mm256_undefined_ps(),
        [[0, 1, 2, 3], [4, 5, 6, 7]][IMM1 as usize],
    )
}

/// Extracts 128 bits (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_pd<const IMM1: i32>(a: __m256d) -> __m128d {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(a, _mm256_undefined_pd(), [[0, 1], [2, 3]][IMM1 as usize])
}

/// Extracts 128 bits (composed of integer data) from `a`, selected with `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extractf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vextractf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extractf128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_uimm_bits!(IMM1, 1);
    let dst: i64x2 = simd_shuffle!(a.as_i64x4(), i64x4::ZERO, [[0, 1], [2, 3]][IMM1 as usize],);
    transmute(dst)
}
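// Illustrative sketch (not from the original file; assumes AVX is available):
// `IMM1` selects which 128-bit half of the 256-bit vector is returned.
//
//     let a = _mm256_setr_ps(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
//     let lo = _mm256_extractf128_ps::<0>(a); // [0.0, 1.0, 2.0, 3.0]
//     let hi = _mm256_extractf128_ps::<1>(a); // [4.0, 5.0, 6.0, 7.0]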

/// Extracts a 32-bit integer from `a`, selected with `INDEX`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi32)
#[inline]
#[target_feature(enable = "avx")]
// This intrinsic has no corresponding instruction.
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_extract_epi32<const INDEX: i32>(a: __m256i) -> i32 {
    static_assert_uimm_bits!(INDEX, 3);
    simd_extract!(a.as_i32x8(), INDEX as u32)
}

/// Returns the first element of the input vector of `[8 x i32]`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtsi256_si32)
#[inline]
#[target_feature(enable = "avx")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 {
    simd_extract!(a.as_i32x8(), 0)
}

/// Zeroes the contents of all XMM or YMM registers.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroall)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroall))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroall() {
    vzeroall()
}

/// Zeroes the upper 128 bits of all YMM registers;
/// the lower 128-bits of the registers are unmodified.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zeroupper)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vzeroupper))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_zeroupper() {
    vzeroupper()
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 {
    vpermilps256(a, b.as_i32x8())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 {
    vpermilps(a, b.as_i32x4())
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        _mm256_undefined_ps(),
        [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
            (IMM8 as u32 >> 6) & 0b11,
            ((IMM8 as u32 >> 0) & 0b11) + 4,
            ((IMM8 as u32 >> 2) & 0b11) + 4,
            ((IMM8 as u32 >> 4) & 0b11) + 4,
            ((IMM8 as u32 >> 6) & 0b11) + 4,
        ],
    )
}

/// Shuffles single-precision (32-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufps, IMM8 = 9))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    simd_shuffle!(
        a,
        _mm_undefined_ps(),
        [
            (IMM8 as u32 >> 0) & 0b11,
            (IMM8 as u32 >> 2) & 0b11,
            (IMM8 as u32 >> 4) & 0b11,
            (IMM8 as u32 >> 6) & 0b11,
        ],
    )
}
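// Illustrative sketch (not from the original file; assumes AVX is available):
// two bits of `IMM8` per output element select a source element, so 0x1B
// (0b00_01_10_11) reverses a 128-bit vector; `_mm256_permute_ps` applies the
// same pattern to both 128-bit lanes.
//
//     let a = _mm_setr_ps(10.0, 11.0, 12.0, 13.0);
//     // -> [13.0, 12.0, 11.0, 10.0]
//     let r = _mm_permute_ps::<0x1B>(a);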

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d {
    vpermilpd256(a, b.as_i64x4())
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permutevar_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vpermilpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d {
    vpermilpd(a, b.as_i64x2())
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// within 128-bit lanes using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM4 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute_pd<const IMM4: i32>(a: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM4, 4);
    simd_shuffle!(
        a,
        _mm256_undefined_pd(),
        [
            ((IMM4 as u32 >> 0) & 1),
            ((IMM4 as u32 >> 1) & 1),
            ((IMM4 as u32 >> 2) & 1) + 2,
            ((IMM4 as u32 >> 3) & 1) + 2,
        ],
    )
}

/// Shuffles double-precision (64-bit) floating-point elements in `a`
/// using the control in `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_permute_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vshufpd, IMM2 = 0x1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_permute_pd<const IMM2: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    simd_shuffle!(
        a,
        _mm_undefined_pd(),
        [(IMM2 as u32) & 1, (IMM2 as u32 >> 1) & 1],
    )
}

/// Shuffles 256 bits (composed of 8 packed single-precision (32-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x5))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_ps<const IMM8: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_uimm_bits!(IMM8, 8);
    vperm2f128ps256(a, b, IMM8 as i8)
}

/// Shuffles 256 bits (composed of 4 packed double-precision (64-bit)
/// floating-point elements) selected by `imm8` from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_pd<const IMM8: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_uimm_bits!(IMM8, 8);
    vperm2f128pd256(a, b, IMM8 as i8)
}

/// Shuffles 128-bits (composed of integer data) selected by `imm8`
/// from `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2f128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 0x31))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_permute2f128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute(vperm2f128si256(a.as_i32x8(), b.as_i32x8(), IMM8 as i8))
}
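// Illustrative sketch (not from the original file; assumes AVX is available):
// bits [1:0] of `IMM8` pick the low 128-bit half of the result and bits [5:4]
// pick the high half, each from {0: `a` low, 1: `a` high, 2: `b` low,
// 3: `b` high}; bits 3 and 7 zero the corresponding half instead.
//
//     let a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
//     let b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
//     // 0x20 = low half of `a`, then low half of `b` -> [1.0, 2.0, 5.0, 6.0]
//     let r = _mm256_permute2f128_pd::<0x20>(a, b);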

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 {
    _mm256_set1_ps(*f)
}

/// Broadcasts a single-precision (32-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcast_ss)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 {
    _mm_set1_ps(*f)
}

/// Broadcasts a double-precision (64-bit) floating-point element from memory
/// to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_sd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::trivially_copy_pass_by_ref)]
pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d {
    _mm256_set1_pd(*f)
}

/// Broadcasts 128 bits from memory (composed of 4 packed single-precision
/// (32-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 {
    simd_shuffle!(*a, _mm_setzero_ps(), [0, 1, 2, 3, 0, 1, 2, 3])
}

/// Broadcasts 128 bits from memory (composed of 2 packed double-precision
/// (64-bit) floating-point elements) to all elements of the returned vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcast_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(test, assert_instr(vbroadcastf128))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d {
    simd_shuffle!(*a, _mm_setzero_pd(), [0, 1, 0, 1])
}

/// Copies `a` to result, then inserts 128 bits (composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_ps)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_ps<const IMM1: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(
        a,
        _mm256_castps128_ps256(b),
        [[8, 9, 10, 11, 4, 5, 6, 7], [0, 1, 2, 3, 8, 9, 10, 11]][IMM1 as usize],
    )
}

/// Copies `a` to result, then inserts 128 bits (composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_pd)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_pd<const IMM1: i32>(a: __m256d, b: __m128d) -> __m256d {
    static_assert_uimm_bits!(IMM1, 1);
    simd_shuffle!(
        a,
        _mm256_castpd128_pd256(b),
        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
    )
}

/// Copies `a` to result, then inserts 128 bits from `b` into result
/// at the location specified by `imm8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insertf128_si256)
#[inline]
#[target_feature(enable = "avx")]
#[cfg_attr(
    all(test, not(target_env = "msvc")),
    assert_instr(vinsertf128, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm256_insertf128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_uimm_bits!(IMM1, 1);
    let dst: i64x4 = simd_shuffle!(
        a.as_i64x4(),
        _mm256_castsi128_si256(b).as_i64x4(),
        [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize],
    );
    transmute(dst)
}
1345
1346/// Copies `a` to result, and inserts the 8-bit integer `i` into result
1347/// at the location specified by `index`.
1348///
1349/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi8)
1350#[inline]
1351#[target_feature(enable = "avx")]
1352// This intrinsic has no corresponding instruction.
1353#[rustc_legacy_const_generics(2)]
1354#[stable(feature = "simd_x86", since = "1.27.0")]
1355pub unsafe fn _mm256_insert_epi8<const INDEX: i32>(a: __m256i, i: i8) -> __m256i {
1356    static_assert_uimm_bits!(INDEX, 5);
1357    transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i))
1358}
1359
1360/// Copies `a` to result, and inserts the 16-bit integer `i` into result
1361/// at the location specified by `index`.
1362///
1363/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi16)
1364#[inline]
1365#[target_feature(enable = "avx")]
1366// This intrinsic has no corresponding instruction.
1367#[rustc_legacy_const_generics(2)]
1368#[stable(feature = "simd_x86", since = "1.27.0")]
1369pub unsafe fn _mm256_insert_epi16<const INDEX: i32>(a: __m256i, i: i16) -> __m256i {
1370    static_assert_uimm_bits!(INDEX, 4);
1371    transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i))
1372}
1373
1374/// Copies `a` to result, and inserts the 32-bit integer `i` into result
1375/// at the location specified by `index`.
1376///
1377/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_insert_epi32)
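///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setzero_si256();
///             // Write 42 into element 7 (the highest 32-bit lane).
///             let r = _mm256_insert_epi32::<7>(a, 42);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr() as *mut __m256i, r);
///             assert_eq!(out, [0, 0, 0, 0, 0, 0, 0, 42]);
///         }
///     }
/// }
/// ```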
1378#[inline]
1379#[target_feature(enable = "avx")]
1380// This intrinsic has no corresponding instruction.
1381#[rustc_legacy_const_generics(2)]
1382#[stable(feature = "simd_x86", since = "1.27.0")]
1383pub unsafe fn _mm256_insert_epi32<const INDEX: i32>(a: __m256i, i: i32) -> __m256i {
1384    static_assert_uimm_bits!(INDEX, 3);
1385    transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i))
1386}
1387
1388/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1389/// floating-point elements) from memory into result.
1390/// `mem_addr` must be aligned on a 32-byte boundary or a
1391/// general-protection exception may be generated.
1392///
1393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_pd)
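///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check.
/// `#[repr(align(32))]` provides the required 32-byte alignment:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         #[repr(align(32))]
///         struct Aligned([f64; 4]);
///         let data = Aligned([1.0, 2.0, 3.0, 4.0]);
///         unsafe {
///             let v = _mm256_load_pd(data.0.as_ptr());
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [1.0, 2.0, 3.0, 4.0]);
///         }
///     }
/// }
/// ```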
1394#[inline]
1395#[target_feature(enable = "avx")]
1396#[cfg_attr(test, assert_instr(vmovap))]
1397#[stable(feature = "simd_x86", since = "1.27.0")]
1398#[allow(clippy::cast_ptr_alignment)]
1399pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d {
1400    *(mem_addr as *const __m256d)
1401}
1402
1403/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1404/// floating-point elements) from `a` into memory.
1405/// `mem_addr` must be aligned on a 32-byte boundary or a
1406/// general-protection exception may be generated.
1407///
1408/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_pd)
1409#[inline]
1410#[target_feature(enable = "avx")]
1411#[cfg_attr(test, assert_instr(vmovap))]
1412#[stable(feature = "simd_x86", since = "1.27.0")]
1413#[allow(clippy::cast_ptr_alignment)]
1414pub unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d) {
1415    *(mem_addr as *mut __m256d) = a;
1416}
1417
1418/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1419/// floating-point elements) from memory into result.
1420/// `mem_addr` must be aligned on a 32-byte boundary or a
1421/// general-protection exception may be generated.
1422///
1423/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_ps)
1424#[inline]
1425#[target_feature(enable = "avx")]
1426#[cfg_attr(test, assert_instr(vmovaps))]
1427#[stable(feature = "simd_x86", since = "1.27.0")]
1428#[allow(clippy::cast_ptr_alignment)]
1429pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 {
1430    *(mem_addr as *const __m256)
1431}
1432
1433/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1434/// floating-point elements) from `a` into memory.
1435/// `mem_addr` must be aligned on a 32-byte boundary or a
1436/// general-protection exception may be generated.
1437///
1438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_ps)
1439#[inline]
1440#[target_feature(enable = "avx")]
1441#[cfg_attr(test, assert_instr(vmovaps))]
1442#[stable(feature = "simd_x86", since = "1.27.0")]
1443#[allow(clippy::cast_ptr_alignment)]
1444pub unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256) {
1445    *(mem_addr as *mut __m256) = a;
1446}
1447
1448/// Loads 256-bits (composed of 4 packed double-precision (64-bit)
1449/// floating-point elements) from memory into result.
1450/// `mem_addr` does not need to be aligned on any particular boundary.
1451///
1452/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_pd)
1453#[inline]
1454#[target_feature(enable = "avx")]
1455#[cfg_attr(test, assert_instr(vmovup))]
1456#[stable(feature = "simd_x86", since = "1.27.0")]
1457pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d {
1458    let mut dst = _mm256_undefined_pd();
1459    ptr::copy_nonoverlapping(
1460        mem_addr as *const u8,
1461        ptr::addr_of_mut!(dst) as *mut u8,
1462        mem::size_of::<__m256d>(),
1463    );
1464    dst
1465}
1466
1467/// Stores 256-bits (composed of 4 packed double-precision (64-bit)
1468/// floating-point elements) from `a` into memory.
1469/// `mem_addr` does not need to be aligned on any particular boundary.
1470///
1471/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_pd)
1472#[inline]
1473#[target_feature(enable = "avx")]
1474#[cfg_attr(test, assert_instr(vmovup))]
1475#[stable(feature = "simd_x86", since = "1.27.0")]
1476pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) {
1477    mem_addr.cast::<__m256d>().write_unaligned(a);
1478}
1479
1480/// Loads 256-bits (composed of 8 packed single-precision (32-bit)
1481/// floating-point elements) from memory into result.
1482/// `mem_addr` does not need to be aligned on any particular boundary.
1483///
1484/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_ps)
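///
/// # Example
///
/// A minimal round-trip sketch, not part of the original documentation; it
/// assumes the `std::arch::x86_64` re-exports and a runtime AVX check. No
/// particular alignment is required for the source array:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let src = [1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
///             let v = _mm256_loadu_ps(src.as_ptr());
///             let mut dst = [0.0f32; 8];
///             _mm256_storeu_ps(dst.as_mut_ptr(), v);
///             assert_eq!(src, dst);
///         }
///     }
/// }
/// ```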
1485#[inline]
1486#[target_feature(enable = "avx")]
1487#[cfg_attr(test, assert_instr(vmovups))]
1488#[stable(feature = "simd_x86", since = "1.27.0")]
1489pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 {
1490    let mut dst = _mm256_undefined_ps();
1491    ptr::copy_nonoverlapping(
1492        mem_addr as *const u8,
1493        ptr::addr_of_mut!(dst) as *mut u8,
1494        mem::size_of::<__m256>(),
1495    );
1496    dst
1497}
1498
1499/// Stores 256-bits (composed of 8 packed single-precision (32-bit)
1500/// floating-point elements) from `a` into memory.
1501/// `mem_addr` does not need to be aligned on any particular boundary.
1502///
1503/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_ps)
1504#[inline]
1505#[target_feature(enable = "avx")]
1506#[cfg_attr(test, assert_instr(vmovups))]
1507#[stable(feature = "simd_x86", since = "1.27.0")]
1508pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) {
1509    mem_addr.cast::<__m256>().write_unaligned(a);
1510}
1511
1512/// Loads 256-bits of integer data from memory into result.
1513/// `mem_addr` must be aligned on a 32-byte boundary or a
1514/// general-protection exception may be generated.
1515///
1516/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_load_si256)
1517#[inline]
1518#[target_feature(enable = "avx")]
1519#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
1520#[stable(feature = "simd_x86", since = "1.27.0")]
1521pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i {
1522    *mem_addr
1523}
1524
1525/// Stores 256-bits of integer data from `a` into memory.
1526/// `mem_addr` must be aligned on a 32-byte boundary or a
1527/// general-protection exception may be generated.
1528///
1529/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_store_si256)
1530#[inline]
1531#[target_feature(enable = "avx")]
1532#[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected
1533#[stable(feature = "simd_x86", since = "1.27.0")]
1534pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) {
1535    *mem_addr = a;
1536}
1537
1538/// Loads 256-bits of integer data from memory into result.
1539/// `mem_addr` does not need to be aligned on any particular boundary.
1540///
1541/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu_si256)
1542#[inline]
1543#[target_feature(enable = "avx")]
1544#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1545#[stable(feature = "simd_x86", since = "1.27.0")]
1546pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i {
1547    let mut dst = _mm256_undefined_si256();
1548    ptr::copy_nonoverlapping(
1549        mem_addr as *const u8,
1550        ptr::addr_of_mut!(dst) as *mut u8,
1551        mem::size_of::<__m256i>(),
1552    );
1553    dst
1554}
1555
1556/// Stores 256-bits of integer data from `a` into memory.
1557/// `mem_addr` does not need to be aligned on any particular boundary.
1558///
1559/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu_si256)
1560#[inline]
1561#[target_feature(enable = "avx")]
1562#[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected
1563#[stable(feature = "simd_x86", since = "1.27.0")]
1564pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) {
1565    mem_addr.write_unaligned(a);
1566}
1567
1568/// Loads packed double-precision (64-bit) floating-point elements from memory
1569/// into result using `mask` (elements are zeroed out when the high bit of the
1570/// corresponding element is not set).
1571///
1572/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_pd)
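///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check. Only elements
/// whose 64-bit mask lane has its high bit set are loaded:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let data = [1.0f64, 2.0, 3.0, 4.0];
///             // -1 has the high bit set, 0 does not.
///             let mask = _mm256_setr_epi64x(-1, 0, -1, 0);
///             let v = _mm256_maskload_pd(data.as_ptr(), mask);
///             let mut out = [9.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [1.0, 0.0, 3.0, 0.0]);
///         }
///     }
/// }
/// ```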
1573#[inline]
1574#[target_feature(enable = "avx")]
1575#[cfg_attr(test, assert_instr(vmaskmovpd))]
1576#[stable(feature = "simd_x86", since = "1.27.0")]
1577pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d {
1578    maskloadpd256(mem_addr as *const i8, mask.as_i64x4())
1579}
1580
1581/// Stores packed double-precision (64-bit) floating-point elements from `a`
1582/// into memory using `mask`.
1583///
1584/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_pd)
1585#[inline]
1586#[target_feature(enable = "avx")]
1587#[cfg_attr(test, assert_instr(vmaskmovpd))]
1588#[stable(feature = "simd_x86", since = "1.27.0")]
1589pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) {
1590    maskstorepd256(mem_addr as *mut i8, mask.as_i64x4(), a);
1591}
1592
1593/// Loads packed double-precision (64-bit) floating-point elements from memory
1594/// into result using `mask` (elements are zeroed out when the high bit of the
1595/// corresponding element is not set).
1596///
1597/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_pd)
1598#[inline]
1599#[target_feature(enable = "avx")]
1600#[cfg_attr(test, assert_instr(vmaskmovpd))]
1601#[stable(feature = "simd_x86", since = "1.27.0")]
1602pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d {
1603    maskloadpd(mem_addr as *const i8, mask.as_i64x2())
1604}
1605
1606/// Stores packed double-precision (64-bit) floating-point elements from `a`
1607/// into memory using `mask`.
1608///
1609/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_pd)
1610#[inline]
1611#[target_feature(enable = "avx")]
1612#[cfg_attr(test, assert_instr(vmaskmovpd))]
1613#[stable(feature = "simd_x86", since = "1.27.0")]
1614pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) {
1615    maskstorepd(mem_addr as *mut i8, mask.as_i64x2(), a);
1616}
1617
1618/// Loads packed single-precision (32-bit) floating-point elements from memory
1619/// into result using `mask` (elements are zeroed out when the high bit of the
1620/// corresponding element is not set).
1621///
1622/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_ps)
1623#[inline]
1624#[target_feature(enable = "avx")]
1625#[cfg_attr(test, assert_instr(vmaskmovps))]
1626#[stable(feature = "simd_x86", since = "1.27.0")]
1627pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 {
1628    maskloadps256(mem_addr as *const i8, mask.as_i32x8())
1629}
1630
1631/// Stores packed single-precision (32-bit) floating-point elements from `a`
1632/// into memory using `mask`.
1633///
1634/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_ps)
1635#[inline]
1636#[target_feature(enable = "avx")]
1637#[cfg_attr(test, assert_instr(vmaskmovps))]
1638#[stable(feature = "simd_x86", since = "1.27.0")]
1639pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) {
1640    maskstoreps256(mem_addr as *mut i8, mask.as_i32x8(), a);
1641}
1642
1643/// Loads packed single-precision (32-bit) floating-point elements from memory
1644/// into result using `mask` (elements are zeroed out when the high bit of the
1645/// corresponding element is not set).
1646///
1647/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_ps)
1648#[inline]
1649#[target_feature(enable = "avx")]
1650#[cfg_attr(test, assert_instr(vmaskmovps))]
1651#[stable(feature = "simd_x86", since = "1.27.0")]
1652pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 {
1653    maskloadps(mem_addr as *const i8, mask.as_i32x4())
1654}
1655
1656/// Stores packed single-precision (32-bit) floating-point elements from `a`
1657/// into memory using `mask`.
1658///
1659/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_ps)
1660#[inline]
1661#[target_feature(enable = "avx")]
1662#[cfg_attr(test, assert_instr(vmaskmovps))]
1663#[stable(feature = "simd_x86", since = "1.27.0")]
1664pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) {
1665    maskstoreps(mem_addr as *mut i8, mask.as_i32x4(), a);
1666}
1667
1668/// Duplicates odd-indexed single-precision (32-bit) floating-point elements
1669/// from `a`, and returns the results.
1670///
1671/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movehdup_ps)
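///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_ps(0., 1., 2., 3., 4., 5., 6., 7.);
///             let r = _mm256_movehdup_ps(a);
///             let mut out = [0.0f32; 8];
///             _mm256_storeu_ps(out.as_mut_ptr(), r);
///             // Each odd-indexed element is duplicated into the slot below it.
///             assert_eq!(out, [1., 1., 3., 3., 5., 5., 7., 7.]);
///         }
///     }
/// }
/// ```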
1672#[inline]
1673#[target_feature(enable = "avx")]
1674#[cfg_attr(test, assert_instr(vmovshdup))]
1675#[stable(feature = "simd_x86", since = "1.27.0")]
1676pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 {
1677    simd_shuffle!(a, a, [1, 1, 3, 3, 5, 5, 7, 7])
1678}
1679
1680/// Duplicates even-indexed single-precision (32-bit) floating-point elements
1681/// from `a`, and returns the results.
1682///
1683/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_moveldup_ps)
1684#[inline]
1685#[target_feature(enable = "avx")]
1686#[cfg_attr(test, assert_instr(vmovsldup))]
1687#[stable(feature = "simd_x86", since = "1.27.0")]
1688pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 {
1689    simd_shuffle!(a, a, [0, 0, 2, 2, 4, 4, 6, 6])
1690}
1691
1692/// Duplicates even-indexed double-precision (64-bit) floating-point elements
1693/// from `a`, and returns the results.
1694///
1695/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movedup_pd)
1696#[inline]
1697#[target_feature(enable = "avx")]
1698#[cfg_attr(test, assert_instr(vmovddup))]
1699#[stable(feature = "simd_x86", since = "1.27.0")]
1700pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d {
1701    simd_shuffle!(a, a, [0, 0, 2, 2])
1702}
1703
1704/// Loads 256-bits of integer data from unaligned memory into result.
1705/// This intrinsic may perform better than `_mm256_loadu_si256` when the
1706/// data crosses a cache line boundary.
1707///
1708/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_lddqu_si256)
1709#[inline]
1710#[target_feature(enable = "avx")]
1711#[cfg_attr(test, assert_instr(vlddqu))]
1712#[stable(feature = "simd_x86", since = "1.27.0")]
1713pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
1714    transmute(vlddqu(mem_addr as *const i8))
1715}
1716
1717/// Moves integer data from a 256-bit integer vector to a 32-byte
1718/// aligned memory location. To minimize caching, the data is flagged as
1719/// non-temporal (unlikely to be used again soon).
1720///
1721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_si256)
1722///
1723/// # Safety of non-temporal stores
1724///
1725/// After using this intrinsic, but before any other access to the memory that this intrinsic
1726/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1727/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1728/// return.
1729///
1730/// See [`_mm_sfence`] for details.
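///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check, and it pairs
/// the non-temporal store with `_mm_sfence` as required above:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         #[repr(align(32))]
///         struct Aligned([i64; 4]);
///         let mut buf = Aligned([0; 4]);
///         unsafe {
///             let v = _mm256_set1_epi64x(7);
///             _mm256_stream_si256(buf.0.as_mut_ptr() as *mut __m256i, v);
///             // Fence before any other access to the stored memory.
///             _mm_sfence();
///         }
///         assert_eq!(buf.0, [7, 7, 7, 7]);
///     }
/// }
/// ```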
1731#[inline]
1732#[target_feature(enable = "avx")]
1733#[cfg_attr(test, assert_instr(vmovntdq))]
1734#[stable(feature = "simd_x86", since = "1.27.0")]
1735pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1736    crate::arch::asm!(
1737        vps!("vmovntdq", ",{a}"),
1738        p = in(reg) mem_addr,
1739        a = in(ymm_reg) a,
1740        options(nostack, preserves_flags),
1741    );
1742}
1743
1744/// Moves double-precision values from a 256-bit vector of `[4 x double]`
1745/// to a 32-byte aligned memory location. To minimize caching, the data is
1746/// flagged as non-temporal (unlikely to be used again soon).
1747///
1748/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_pd)
1749///
1750/// # Safety of non-temporal stores
1751///
1752/// After using this intrinsic, but before any other access to the memory that this intrinsic
1753/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1754/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1755/// return.
1756///
1757/// See [`_mm_sfence`] for details.
1758#[inline]
1759#[target_feature(enable = "avx")]
1760#[cfg_attr(test, assert_instr(vmovntpd))]
1761#[stable(feature = "simd_x86", since = "1.27.0")]
1762#[allow(clippy::cast_ptr_alignment)]
1763pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1764    crate::arch::asm!(
1765        vps!("vmovntpd", ",{a}"),
1766        p = in(reg) mem_addr,
1767        a = in(ymm_reg) a,
1768        options(nostack, preserves_flags),
1769    );
1770}
1771
1772/// Moves single-precision floating point values from a 256-bit vector
1773/// of `[8 x float]` to a 32-byte aligned memory location. To minimize
1774/// caching, the data is flagged as non-temporal (unlikely to be used again
1775/// soon).
1776///
1777/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_ps)
1778///
1779/// # Safety of non-temporal stores
1780///
1781/// After using this intrinsic, but before any other access to the memory that this intrinsic
1782/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1783/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1784/// return.
1785///
1786/// See [`_mm_sfence`] for details.
1787#[inline]
1788#[target_feature(enable = "avx")]
1789#[cfg_attr(test, assert_instr(vmovntps))]
1790#[stable(feature = "simd_x86", since = "1.27.0")]
1791#[allow(clippy::cast_ptr_alignment)]
1792pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1793    crate::arch::asm!(
1794        vps!("vmovntps", ",{a}"),
1795        p = in(reg) mem_addr,
1796        a = in(ymm_reg) a,
1797        options(nostack, preserves_flags),
1798    );
1799}
1800
1801/// Computes the approximate reciprocal of packed single-precision (32-bit)
1802/// floating-point elements in `a`, and returns the results. The maximum
1803/// relative error for this approximation is less than 1.5*2^-12.
1804///
1805/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rcp_ps)
1806#[inline]
1807#[target_feature(enable = "avx")]
1808#[cfg_attr(test, assert_instr(vrcpps))]
1809#[stable(feature = "simd_x86", since = "1.27.0")]
1810pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 {
1811    vrcpps(a)
1812}
1813
1814/// Computes the approximate reciprocal square root of packed single-precision
1815/// (32-bit) floating-point elements in `a`, and returns the results.
1816/// The maximum relative error for this approximation is less than 1.5*2^-12.
1817///
1818/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_rsqrt_ps)
1819#[inline]
1820#[target_feature(enable = "avx")]
1821#[cfg_attr(test, assert_instr(vrsqrtps))]
1822#[stable(feature = "simd_x86", since = "1.27.0")]
1823pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 {
1824    vrsqrtps(a)
1825}
1826
1827/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1828/// from the high half of each 128-bit lane in `a` and `b`.
1829///
1830/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_pd)
1831#[inline]
1832#[target_feature(enable = "avx")]
1833#[cfg_attr(test, assert_instr(vunpckhpd))]
1834#[stable(feature = "simd_x86", since = "1.27.0")]
1835pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d {
1836    simd_shuffle!(a, b, [1, 5, 3, 7])
1837}
1838
1839/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1840/// from the high half of each 128-bit lane in `a` and `b`.
1841///
1842/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_ps)
1843#[inline]
1844#[target_feature(enable = "avx")]
1845#[cfg_attr(test, assert_instr(vunpckhps))]
1846#[stable(feature = "simd_x86", since = "1.27.0")]
1847pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 {
1848    simd_shuffle!(a, b, [2, 10, 3, 11, 6, 14, 7, 15])
1849}
1850
1851/// Unpacks and interleaves double-precision (64-bit) floating-point elements
1852/// from the low half of each 128-bit lane in `a` and `b`.
1853///
1854/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_pd)
1855#[inline]
1856#[target_feature(enable = "avx")]
1857#[cfg_attr(test, assert_instr(vunpcklpd))]
1858#[stable(feature = "simd_x86", since = "1.27.0")]
1859pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d {
1860    simd_shuffle!(a, b, [0, 4, 2, 6])
1861}
1862
1863/// Unpacks and interleaves single-precision (32-bit) floating-point elements
1864/// from the low half of each 128-bit lane in `a` and `b`.
1865///
1866/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_ps)
1867#[inline]
1868#[target_feature(enable = "avx")]
1869#[cfg_attr(test, assert_instr(vunpcklps))]
1870#[stable(feature = "simd_x86", since = "1.27.0")]
1871pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 {
1872    simd_shuffle!(a, b, [0, 8, 1, 9, 4, 12, 5, 13])
1873}
1874
1875/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1876/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
1877/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
1878/// the result is zero, otherwise set `CF` to 0. Return the `ZF` value.
1879///
1880/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_si256)
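///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check. The result is 1
/// exactly when `a & b` has no bit set:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_set1_epi32(0b0101);
///             let b = _mm256_set1_epi32(0b1010);
///             let c = _mm256_set1_epi32(0b0111);
///             assert_eq!(_mm256_testz_si256(a, b), 1); // a & b == 0
///             assert_eq!(_mm256_testz_si256(a, c), 0); // a & c != 0
///         }
///     }
/// }
/// ```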
1881#[inline]
1882#[target_feature(enable = "avx")]
1883#[cfg_attr(test, assert_instr(vptest))]
1884#[stable(feature = "simd_x86", since = "1.27.0")]
1885pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 {
1886    ptestz256(a.as_i64x4(), b.as_i64x4())
1887}
1888
1889/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1890/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
1891/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
1892/// the result is zero, otherwise set `CF` to 0. Return the `CF` value.
1893///
1894/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_si256)
1895#[inline]
1896#[target_feature(enable = "avx")]
1897#[cfg_attr(test, assert_instr(vptest))]
1898#[stable(feature = "simd_x86", since = "1.27.0")]
1899pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 {
1900    ptestc256(a.as_i64x4(), b.as_i64x4())
1901}
1902
1903/// Computes the bitwise AND of 256 bits (representing integer data) in `a` and
1904/// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0.
1905/// Computes the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if
1906/// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and
1907/// `CF` values are zero, otherwise return 0.
1908///
1909/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_si256)
1910#[inline]
1911#[target_feature(enable = "avx")]
1912#[cfg_attr(test, assert_instr(vptest))]
1913#[stable(feature = "simd_x86", since = "1.27.0")]
1914pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 {
1915    ptestnzc256(a.as_i64x4(), b.as_i64x4())
1916}
1917
1918/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
1919/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1920/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1921/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1922/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1923/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1924/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
1925///
1926/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_pd)
1927#[inline]
1928#[target_feature(enable = "avx")]
1929#[cfg_attr(test, assert_instr(vtestpd))]
1930#[stable(feature = "simd_x86", since = "1.27.0")]
1931pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 {
1932    vtestzpd256(a, b)
1933}
1934
1935/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
1936/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1937/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1938/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1939/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1940/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1941/// is zero, otherwise set `CF` to 0. Return the `CF` value.
1942///
1943/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_pd)
1944#[inline]
1945#[target_feature(enable = "avx")]
1946#[cfg_attr(test, assert_instr(vtestpd))]
1947#[stable(feature = "simd_x86", since = "1.27.0")]
1948pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 {
1949    vtestcpd256(a, b)
1950}
1951
1952/// Computes the bitwise AND of 256 bits (representing double-precision (64-bit)
1953/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
1954/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1955/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1956/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1957/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1958/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
1959/// are zero, otherwise return 0.
1960///
1961/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_pd)
1962#[inline]
1963#[target_feature(enable = "avx")]
1964#[cfg_attr(test, assert_instr(vtestpd))]
1965#[stable(feature = "simd_x86", since = "1.27.0")]
1966pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 {
1967    vtestnzcpd256(a, b)
1968}
1969
1970/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
1971/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1972/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1973/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1974/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1975/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1976/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
1977///
1978/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_pd)
1979#[inline]
1980#[target_feature(enable = "avx")]
1981#[cfg_attr(test, assert_instr(vtestpd))]
1982#[stable(feature = "simd_x86", since = "1.27.0")]
1983pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 {
1984    vtestzpd(a, b)
1985}
1986
1987/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
1988/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
1989/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
1990/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
1991/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
1992/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
1993/// is zero, otherwise set `CF` to 0. Return the `CF` value.
1994///
1995/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_pd)
1996#[inline]
1997#[target_feature(enable = "avx")]
1998#[cfg_attr(test, assert_instr(vtestpd))]
1999#[stable(feature = "simd_x86", since = "1.27.0")]
2000pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 {
2001    vtestcpd(a, b)
2002}
2003
2004/// Computes the bitwise AND of 128 bits (representing double-precision (64-bit)
2005/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2006/// value, and set `ZF` to 1 if the sign bit of each 64-bit element in the
2007/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2008/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2009/// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value
2010/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2011/// are zero, otherwise return 0.
2012///
2013/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_pd)
2014#[inline]
2015#[target_feature(enable = "avx")]
2016#[cfg_attr(test, assert_instr(vtestpd))]
2017#[stable(feature = "simd_x86", since = "1.27.0")]
2018pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 {
2019    vtestnzcpd(a, b)
2020}
2021
2022/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2023/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2024/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2025/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2026/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2027/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2028/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2029///
2030/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testz_ps)
2031#[inline]
2032#[target_feature(enable = "avx")]
2033#[cfg_attr(test, assert_instr(vtestps))]
2034#[stable(feature = "simd_x86", since = "1.27.0")]
2035pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 {
2036    vtestzps256(a, b)
2037}
2038
2039/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2040/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2041/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2042/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2043/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2044/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2045/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2046///
2047/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testc_ps)
2048#[inline]
2049#[target_feature(enable = "avx")]
2050#[cfg_attr(test, assert_instr(vtestps))]
2051#[stable(feature = "simd_x86", since = "1.27.0")]
2052pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 {
2053    vtestcps256(a, b)
2054}
2055
2056/// Computes the bitwise AND of 256 bits (representing single-precision (32-bit)
2057/// floating-point elements) in `a` and `b`, producing an intermediate 256-bit
2058/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2059/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2060/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2061/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2062/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2063/// are zero, otherwise return 0.
2064///
2065/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_testnzc_ps)
2066#[inline]
2067#[target_feature(enable = "avx")]
2068#[cfg_attr(test, assert_instr(vtestps))]
2069#[stable(feature = "simd_x86", since = "1.27.0")]
2070pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 {
2071    vtestnzcps256(a, b)
2072}
2073
2074/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2075/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2076/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2077/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2078/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2079/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2080/// is zero, otherwise set `CF` to 0. Return the `ZF` value.
2081///
2082/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_ps)
2083#[inline]
2084#[target_feature(enable = "avx")]
2085#[cfg_attr(test, assert_instr(vtestps))]
2086#[stable(feature = "simd_x86", since = "1.27.0")]
2087pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 {
2088    vtestzps(a, b)
2089}
2090
2091/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2092/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2093/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2094/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2095/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2096/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2097/// is zero, otherwise set `CF` to 0. Return the `CF` value.
2098///
2099/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_ps)
2100#[inline]
2101#[target_feature(enable = "avx")]
2102#[cfg_attr(test, assert_instr(vtestps))]
2103#[stable(feature = "simd_x86", since = "1.27.0")]
2104pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 {
2105    vtestcps(a, b)
2106}
2107
2108/// Computes the bitwise AND of 128 bits (representing single-precision (32-bit)
2109/// floating-point elements) in `a` and `b`, producing an intermediate 128-bit
2110/// value, and set `ZF` to 1 if the sign bit of each 32-bit element in the
2111/// intermediate value is zero, otherwise set `ZF` to 0. Compute the bitwise
2112/// NOT of `a` and then AND with `b`, producing an intermediate value, and set
2113/// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value
2114/// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values
2115/// are zero, otherwise return 0.
2116///
2117/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_ps)
2118#[inline]
2119#[target_feature(enable = "avx")]
2120#[cfg_attr(test, assert_instr(vtestps))]
2121#[stable(feature = "simd_x86", since = "1.27.0")]
2122pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 {
2123    vtestnzcps(a, b)
2124}
2125
2126/// Sets each bit of the returned mask based on the most significant bit of the
2127/// corresponding packed double-precision (64-bit) floating-point element in
2128/// `a`.
2129///
2130/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_pd)
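///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check. Bit `i` of the
/// result is the sign bit of element `i`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let a = _mm256_setr_pd(-1.0, 2.0, -3.0, 4.0);
///             // Elements 0 and 2 are negative, so bits 0 and 2 are set.
///             assert_eq!(_mm256_movemask_pd(a), 0b0101);
///         }
///     }
/// }
/// ```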
2131#[inline]
2132#[target_feature(enable = "avx")]
2133#[cfg_attr(test, assert_instr(vmovmskpd))]
2134#[stable(feature = "simd_x86", since = "1.27.0")]
2135pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 {
2136    // Propagate the highest bit to the rest, because simd_bitmask
2137    // requires all-1 or all-0.
2138    let mask: i64x4 = simd_lt(transmute(a), i64x4::ZERO);
2139    simd_bitmask::<i64x4, u8>(mask).into()
2140}
2141
2142/// Sets each bit of the returned mask based on the most significant bit of the
2143/// corresponding packed single-precision (32-bit) floating-point element in
2144/// `a`.
2145///
2146/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_ps)
2147#[inline]
2148#[target_feature(enable = "avx")]
2149#[cfg_attr(test, assert_instr(vmovmskps))]
2150#[stable(feature = "simd_x86", since = "1.27.0")]
2151pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 {
2152    // Propagate the highest bit to the rest, because simd_bitmask
2153    // requires all-1 or all-0.
2154    let mask: i32x8 = simd_lt(transmute(a), i32x8::ZERO);
2155    simd_bitmask::<i32x8, u8>(mask).into()
2156}
2157
2158/// Returns vector of type __m256d with all elements set to zero.
2159///
2160/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_pd)
2161#[inline]
2162#[target_feature(enable = "avx")]
2163#[cfg_attr(test, assert_instr(vxorp))]
2164#[stable(feature = "simd_x86", since = "1.27.0")]
2165pub unsafe fn _mm256_setzero_pd() -> __m256d {
2166    const { mem::zeroed() }
2167}
2168
2169/// Returns vector of type __m256 with all elements set to zero.
2170///
2171/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_ps)
2172#[inline]
2173#[target_feature(enable = "avx")]
2174#[cfg_attr(test, assert_instr(vxorps))]
2175#[stable(feature = "simd_x86", since = "1.27.0")]
2176pub unsafe fn _mm256_setzero_ps() -> __m256 {
2177    const { mem::zeroed() }
2178}
2179
2180/// Returns vector of type __m256i with all elements set to zero.
2181///
2182/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setzero_si256)
2183#[inline]
2184#[target_feature(enable = "avx")]
2185#[cfg_attr(test, assert_instr(vxor))]
2186#[stable(feature = "simd_x86", since = "1.27.0")]
2187pub unsafe fn _mm256_setzero_si256() -> __m256i {
2188    const { mem::zeroed() }
2189}
2190
2191/// Sets packed double-precision (64-bit) floating-point elements in returned
2192/// vector with the supplied values.
2193///
2194/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_pd)
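///
/// # Example
///
/// A minimal usage sketch, not part of the original documentation; it assumes
/// the `std::arch::x86_64` re-exports and a runtime AVX check. Note that the
/// first argument becomes the highest-indexed element:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx") {
///         unsafe {
///             let v = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
///             let mut out = [0.0f64; 4];
///             _mm256_storeu_pd(out.as_mut_ptr(), v);
///             assert_eq!(out, [0.0, 1.0, 2.0, 3.0]);
///         }
///     }
/// }
/// ```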
2195#[inline]
2196#[target_feature(enable = "avx")]
2197// This intrinsic has no corresponding instruction.
2198#[cfg_attr(test, assert_instr(vinsertf128))]
2199#[stable(feature = "simd_x86", since = "1.27.0")]
2200pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2201    _mm256_setr_pd(d, c, b, a)
2202}
2203
2204/// Sets packed single-precision (32-bit) floating-point elements in returned
2205/// vector with the supplied values.
2206///
2207/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_ps)
2208#[inline]
2209#[target_feature(enable = "avx")]
2210// This intrinsic has no corresponding instruction.
2211#[stable(feature = "simd_x86", since = "1.27.0")]
2212pub unsafe fn _mm256_set_ps(
2213    a: f32,
2214    b: f32,
2215    c: f32,
2216    d: f32,
2217    e: f32,
2218    f: f32,
2219    g: f32,
2220    h: f32,
2221) -> __m256 {
2222    _mm256_setr_ps(h, g, f, e, d, c, b, a)
2223}
2224
2225/// Sets packed 8-bit integers in returned vector with the supplied values.
2226///
2227/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi8)
2228#[inline]
2229#[target_feature(enable = "avx")]
2230// This intrinsic has no corresponding instruction.
2231#[stable(feature = "simd_x86", since = "1.27.0")]
2232pub unsafe fn _mm256_set_epi8(
2233    e00: i8,
2234    e01: i8,
2235    e02: i8,
2236    e03: i8,
2237    e04: i8,
2238    e05: i8,
2239    e06: i8,
2240    e07: i8,
2241    e08: i8,
2242    e09: i8,
2243    e10: i8,
2244    e11: i8,
2245    e12: i8,
2246    e13: i8,
2247    e14: i8,
2248    e15: i8,
2249    e16: i8,
2250    e17: i8,
2251    e18: i8,
2252    e19: i8,
2253    e20: i8,
2254    e21: i8,
2255    e22: i8,
2256    e23: i8,
2257    e24: i8,
2258    e25: i8,
2259    e26: i8,
2260    e27: i8,
2261    e28: i8,
2262    e29: i8,
2263    e30: i8,
2264    e31: i8,
2265) -> __m256i {
2266    #[rustfmt::skip]
2267    _mm256_setr_epi8(
2268        e31, e30, e29, e28, e27, e26, e25, e24,
2269        e23, e22, e21, e20, e19, e18, e17, e16,
2270        e15, e14, e13, e12, e11, e10, e09, e08,
2271        e07, e06, e05, e04, e03, e02, e01, e00,
2272    )
2273}
2274
2275/// Sets packed 16-bit integers in returned vector with the supplied values.
2276///
2277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi16)
2278#[inline]
2279#[target_feature(enable = "avx")]
2280// This intrinsic has no corresponding instruction.
2281#[stable(feature = "simd_x86", since = "1.27.0")]
2282pub unsafe fn _mm256_set_epi16(
2283    e00: i16,
2284    e01: i16,
2285    e02: i16,
2286    e03: i16,
2287    e04: i16,
2288    e05: i16,
2289    e06: i16,
2290    e07: i16,
2291    e08: i16,
2292    e09: i16,
2293    e10: i16,
2294    e11: i16,
2295    e12: i16,
2296    e13: i16,
2297    e14: i16,
2298    e15: i16,
2299) -> __m256i {
2300    #[rustfmt::skip]
2301    _mm256_setr_epi16(
2302        e15, e14, e13, e12,
2303        e11, e10, e09, e08,
2304        e07, e06, e05, e04,
2305        e03, e02, e01, e00,
2306    )
2307}
2308
2309/// Sets packed 32-bit integers in returned vector with the supplied values.
2310///
2311/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi32)
2312#[inline]
2313#[target_feature(enable = "avx")]
2314// This intrinsic has no corresponding instruction.
2315#[stable(feature = "simd_x86", since = "1.27.0")]
2316pub unsafe fn _mm256_set_epi32(
2317    e0: i32,
2318    e1: i32,
2319    e2: i32,
2320    e3: i32,
2321    e4: i32,
2322    e5: i32,
2323    e6: i32,
2324    e7: i32,
2325) -> __m256i {
2326    _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0)
2327}
2328
2329/// Sets packed 64-bit integers in returned vector with the supplied values.
2330///
2331/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_epi64x)
2332#[inline]
2333#[target_feature(enable = "avx")]
2334// This intrinsic has no corresponding instruction.
2335#[stable(feature = "simd_x86", since = "1.27.0")]
2336pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2337    _mm256_setr_epi64x(d, c, b, a)
2338}
2339
2340/// Sets packed double-precision (64-bit) floating-point elements in returned
2341/// vector with the supplied values in reverse order.
2342///
2343/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_pd)
2344#[inline]
2345#[target_feature(enable = "avx")]
2346// This intrinsic has no corresponding instruction.
2347#[stable(feature = "simd_x86", since = "1.27.0")]
2348pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d {
2349    __m256d([a, b, c, d])
2350}
2351
2352/// Sets packed single-precision (32-bit) floating-point elements in returned
2353/// vector with the supplied values in reverse order.
2354///
2355/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_ps)
2356#[inline]
2357#[target_feature(enable = "avx")]
2358// This intrinsic has no corresponding instruction.
2359#[stable(feature = "simd_x86", since = "1.27.0")]
2360pub unsafe fn _mm256_setr_ps(
2361    a: f32,
2362    b: f32,
2363    c: f32,
2364    d: f32,
2365    e: f32,
2366    f: f32,
2367    g: f32,
2368    h: f32,
2369) -> __m256 {
2370    __m256([a, b, c, d, e, f, g, h])
2371}
2372
2373/// Sets packed 8-bit integers in returned vector with the supplied values in
2374/// reverse order.
2375///
2376/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi8)
2377#[inline]
2378#[target_feature(enable = "avx")]
2379// This intrinsic has no corresponding instruction.
2380#[stable(feature = "simd_x86", since = "1.27.0")]
2381pub unsafe fn _mm256_setr_epi8(
2382    e00: i8,
2383    e01: i8,
2384    e02: i8,
2385    e03: i8,
2386    e04: i8,
2387    e05: i8,
2388    e06: i8,
2389    e07: i8,
2390    e08: i8,
2391    e09: i8,
2392    e10: i8,
2393    e11: i8,
2394    e12: i8,
2395    e13: i8,
2396    e14: i8,
2397    e15: i8,
2398    e16: i8,
2399    e17: i8,
2400    e18: i8,
2401    e19: i8,
2402    e20: i8,
2403    e21: i8,
2404    e22: i8,
2405    e23: i8,
2406    e24: i8,
2407    e25: i8,
2408    e26: i8,
2409    e27: i8,
2410    e28: i8,
2411    e29: i8,
2412    e30: i8,
2413    e31: i8,
2414) -> __m256i {
2415    #[rustfmt::skip]
2416    transmute(i8x32::new(
2417        e00, e01, e02, e03, e04, e05, e06, e07,
2418        e08, e09, e10, e11, e12, e13, e14, e15,
2419        e16, e17, e18, e19, e20, e21, e22, e23,
2420        e24, e25, e26, e27, e28, e29, e30, e31,
2421    ))
2422}
2423
2424/// Sets packed 16-bit integers in returned vector with the supplied values in
2425/// reverse order.
2426///
2427/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi16)
2428#[inline]
2429#[target_feature(enable = "avx")]
2430// This intrinsic has no corresponding instruction.
2431#[stable(feature = "simd_x86", since = "1.27.0")]
2432pub unsafe fn _mm256_setr_epi16(
2433    e00: i16,
2434    e01: i16,
2435    e02: i16,
2436    e03: i16,
2437    e04: i16,
2438    e05: i16,
2439    e06: i16,
2440    e07: i16,
2441    e08: i16,
2442    e09: i16,
2443    e10: i16,
2444    e11: i16,
2445    e12: i16,
2446    e13: i16,
2447    e14: i16,
2448    e15: i16,
2449) -> __m256i {
2450    #[rustfmt::skip]
2451    transmute(i16x16::new(
2452        e00, e01, e02, e03,
2453        e04, e05, e06, e07,
2454        e08, e09, e10, e11,
2455        e12, e13, e14, e15,
2456    ))
2457}
2458
2459/// Sets packed 32-bit integers in returned vector with the supplied values in
2460/// reverse order.
2461///
2462/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi32)
2463#[inline]
2464#[target_feature(enable = "avx")]
2465// This intrinsic has no corresponding instruction.
2466#[stable(feature = "simd_x86", since = "1.27.0")]
2467pub unsafe fn _mm256_setr_epi32(
2468    e0: i32,
2469    e1: i32,
2470    e2: i32,
2471    e3: i32,
2472    e4: i32,
2473    e5: i32,
2474    e6: i32,
2475    e7: i32,
2476) -> __m256i {
2477    transmute(i32x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
2478}
2479
2480/// Sets packed 64-bit integers in returned vector with the supplied values in
2481/// reverse order.
2482///
2483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_epi64x)
2484#[inline]
2485#[target_feature(enable = "avx")]
2486// This intrinsic has no corresponding instruction.
2487#[stable(feature = "simd_x86", since = "1.27.0")]
2488pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i {
2489    transmute(i64x4::new(a, b, c, d))
2490}
2491
2492/// Broadcasts double-precision (64-bit) floating-point value `a` to all
2493/// elements of returned vector.
2494///
2495/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_pd)
2496#[inline]
2497#[target_feature(enable = "avx")]
2498// This intrinsic has no corresponding instruction.
2499#[stable(feature = "simd_x86", since = "1.27.0")]
2500pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d {
2501    _mm256_setr_pd(a, a, a, a)
2502}
2503
2504/// Broadcasts single-precision (32-bit) floating-point value `a` to all
2505/// elements of returned vector.
2506///
2507/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_ps)
2508#[inline]
2509#[target_feature(enable = "avx")]
2510// This intrinsic has no corresponding instruction.
2511#[stable(feature = "simd_x86", since = "1.27.0")]
2512pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 {
2513    _mm256_setr_ps(a, a, a, a, a, a, a, a)
2514}
2515
2516/// Broadcasts 8-bit integer `a` to all elements of returned vector.
2517/// This intrinsic may generate the `vpbroadcastb` instruction.
2518///
2519/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi8)
2520#[inline]
2521#[target_feature(enable = "avx")]
2522// This intrinsic has no corresponding instruction.
2523#[stable(feature = "simd_x86", since = "1.27.0")]
2524pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i {
2525    #[rustfmt::skip]
2526    _mm256_setr_epi8(
2527        a, a, a, a, a, a, a, a,
2528        a, a, a, a, a, a, a, a,
2529        a, a, a, a, a, a, a, a,
2530        a, a, a, a, a, a, a, a,
2531    )
2532}
2533
2534/// Broadcasts 16-bit integer `a` to all elements of returned vector.
2535/// This intrinsic may generate the `vpbroadcastw` instruction.
2536///
2537/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi16)
2538#[inline]
2539#[target_feature(enable = "avx")]
2540//#[cfg_attr(test, assert_instr(vpshufb))]
2541#[cfg_attr(test, assert_instr(vinsertf128))]
2542// This intrinsic has no corresponding instruction.
2543#[stable(feature = "simd_x86", since = "1.27.0")]
2544pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i {
2545    _mm256_setr_epi16(a, a, a, a, a, a, a, a, a, a, a, a, a, a, a, a)
2546}
2547
2548/// Broadcasts 32-bit integer `a` to all elements of returned vector.
2549/// This intrinsic may generate the `vpbroadcastd` instruction.
2550///
2551/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi32)
2552#[inline]
2553#[target_feature(enable = "avx")]
2554// This intrinsic has no corresponding instruction.
2555#[stable(feature = "simd_x86", since = "1.27.0")]
2556pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i {
2557    _mm256_setr_epi32(a, a, a, a, a, a, a, a)
2558}
2559
2560/// Broadcasts 64-bit integer `a` to all elements of returned vector.
2561/// This intrinsic may generate the `vpbroadcastq` instruction.
2562///
2563/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set1_epi64x)
2564#[inline]
2565#[target_feature(enable = "avx")]
2566#[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(vinsertf128))]
2567#[cfg_attr(all(test, target_arch = "x86"), assert_instr(vbroadcastsd))]
2568// This intrinsic has no corresponding instruction.
2569#[stable(feature = "simd_x86", since = "1.27.0")]
2570pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i {
2571    _mm256_setr_epi64x(a, a, a, a)
2572}
2573
2574/// Casts vector of type __m256d to type __m256.
2575///
2576/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_ps)
2577#[inline]
2578#[target_feature(enable = "avx")]
2579// This intrinsic is only used for compilation and does not generate any
2580// instructions, thus it has zero latency.
2581#[stable(feature = "simd_x86", since = "1.27.0")]
2582pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 {
2583    transmute(a)
2584}
2585
2586/// Casts vector of type __m256 to type __m256d.
2587///
2588/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_pd)
2589#[inline]
2590#[target_feature(enable = "avx")]
2591// This intrinsic is only used for compilation and does not generate any
2592// instructions, thus it has zero latency.
2593#[stable(feature = "simd_x86", since = "1.27.0")]
2594pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d {
2595    transmute(a)
2596}
2597
2598/// Casts vector of type __m256 to type __m256i.
2599///
2600/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps_si256)
2601#[inline]
2602#[target_feature(enable = "avx")]
2603// This intrinsic is only used for compilation and does not generate any
2604// instructions, thus it has zero latency.
2605#[stable(feature = "simd_x86", since = "1.27.0")]
2606pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i {
2607    transmute(a)
2608}
2609
2610/// Casts vector of type __m256i to type __m256.
2611///
2612/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps)
2613#[inline]
2614#[target_feature(enable = "avx")]
2615// This intrinsic is only used for compilation and does not generate any
2616// instructions, thus it has zero latency.
2617#[stable(feature = "simd_x86", since = "1.27.0")]
2618pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 {
2619    transmute(a)
2620}
2621
2622/// Casts vector of type __m256d to type __m256i.
2623///
2624/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd_si256)
2625#[inline]
2626#[target_feature(enable = "avx")]
2627// This intrinsic is only used for compilation and does not generate any
2628// instructions, thus it has zero latency.
2629#[stable(feature = "simd_x86", since = "1.27.0")]
2630pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i {
2631    transmute(a)
2632}
2633
2634/// Casts vector of type __m256i to type __m256d.
2635///
2636/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_pd)
2637#[inline]
2638#[target_feature(enable = "avx")]
2639// This intrinsic is only used for compilation and does not generate any
2640// instructions, thus it has zero latency.
2641#[stable(feature = "simd_x86", since = "1.27.0")]
2642pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d {
2643    transmute(a)
2644}
2645
2646/// Casts vector of type __m256 to type __m128.
2647///
2648/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps256_ps128)
2649#[inline]
2650#[target_feature(enable = "avx")]
2651// This intrinsic is only used for compilation and does not generate any
2652// instructions, thus it has zero latency.
2653#[stable(feature = "simd_x86", since = "1.27.0")]
2654pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 {
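    // Keep only the four low lanes of `a`, i.e. its lower 128 bits.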
2655    simd_shuffle!(a, a, [0, 1, 2, 3])
2656}
2657
2658/// Casts vector of type __m256d to type __m128d.
2659///
2660/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd256_pd128)
2661#[inline]
2662#[target_feature(enable = "avx")]
2663// This intrinsic is only used for compilation and does not generate any
2664// instructions, thus it has zero latency.
2665#[stable(feature = "simd_x86", since = "1.27.0")]
2666pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d {
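    // Keep only the two low lanes of `a`, i.e. its lower 128 bits.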
2667    simd_shuffle!(a, a, [0, 1])
2668}
2669
2670/// Casts vector of type __m256i to type __m128i.
2671///
2672/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_si128)
2673#[inline]
2674#[target_feature(enable = "avx")]
2675// This intrinsic is only used for compilation and does not generate any
2676// instructions, thus it has zero latency.
2677#[stable(feature = "simd_x86", since = "1.27.0")]
2678pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i {
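    // View `a` as four 64-bit lanes and keep the two low ones, i.e. the lower 128 bits.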
2679    let a = a.as_i64x4();
2680    let dst: i64x2 = simd_shuffle!(a, a, [0, 1]);
2681    transmute(dst)
2682}
2683
2684/// Casts vector of type __m128 to type __m256;
2685/// the upper 128 bits of the result are undefined.
2686///
2687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castps128_ps256)
2688#[inline]
2689#[target_feature(enable = "avx")]
2690// This intrinsic is only used for compilation and does not generate any
2691// instructions, thus it has zero latency.
2692#[stable(feature = "simd_x86", since = "1.27.0")]
2693pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 {
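    // Indices 0-3 select the lanes of `a`; index 4 repeats the first lane of the
    // undefined operand, so the upper 128 bits of the result are unspecified.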
2694    simd_shuffle!(a, _mm_undefined_ps(), [0, 1, 2, 3, 4, 4, 4, 4])
2695}
2696
2697/// Casts vector of type __m128d to type __m256d;
2698/// the upper 128 bits of the result are undefined.
2699///
2700/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castpd128_pd256)
2701#[inline]
2702#[target_feature(enable = "avx")]
2703// This intrinsic is only used for compilation and does not generate any
2704// instructions, thus it has zero latency.
2705#[stable(feature = "simd_x86", since = "1.27.0")]
2706pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d {
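    // Indices 0 and 1 select the lanes of `a`; index 2 repeats the first lane of
    // the undefined operand, so the upper 128 bits of the result are unspecified.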
2707    simd_shuffle!(a, _mm_undefined_pd(), [0, 1, 2, 2])
2708}
2709
2710/// Casts vector of type __m128i to type __m256i;
2711/// the upper 128 bits of the result are undefined.
2712///
2713/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi128_si256)
2714#[inline]
2715#[target_feature(enable = "avx")]
2716// This intrinsic is only used for compilation and does not generate any
2717// instructions, thus it has zero latency.
2718#[stable(feature = "simd_x86", since = "1.27.0")]
2719pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i {
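    // The two low 64-bit lanes come from `a`; the upper lanes repeat a lane of the
    // zero vector used here as a stand-in for the documented-undefined upper half.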
2720    let a = a.as_i64x2();
2721    let undefined = i64x2::ZERO;
2722    let dst: i64x4 = simd_shuffle!(a, undefined, [0, 1, 2, 2]);
2723    transmute(dst)
2724}
2725
2726/// Constructs a 256-bit floating-point vector of `[8 x float]` from a
2727/// 128-bit floating-point vector of `[4 x float]`. The lower 128 bits contain
2728/// the value of the source vector. The upper 128 bits are set to zero.
2729///
2730/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextps128_ps256)
2731#[inline]
2732#[target_feature(enable = "avx")]
2733// This intrinsic is only used for compilation and does not generate any
2734// instructions, thus it has zero latency.
2735#[stable(feature = "simd_x86", since = "1.27.0")]
2736pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 {
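    // The lower 128 bits come from `a`; the upper 128 bits come from the zeroed vector.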
2737    simd_shuffle!(a, _mm_setzero_ps(), [0, 1, 2, 3, 4, 5, 6, 7])
2738}
2739
2740/// Constructs a 256-bit integer vector from a 128-bit integer vector.
2741/// The lower 128 bits contain the value of the source vector. The upper
2742/// 128 bits are set to zero.
2743///
2744/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextsi128_si256)
2745#[inline]
2746#[target_feature(enable = "avx")]
2747// This intrinsic is only used for compilation and does not generate any
2748// instructions, thus it has zero latency.
2749#[stable(feature = "simd_x86", since = "1.27.0")]
2750pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i {
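    // The two low 64-bit lanes come from `a`; the two high lanes come from the zero vector `b`.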
2751    let b = i64x2::ZERO;
2752    let dst: i64x4 = simd_shuffle!(a.as_i64x2(), b, [0, 1, 2, 3]);
2753    transmute(dst)
2754}
2755
2756/// Constructs a 256-bit floating-point vector of `[4 x double]` from a
2757/// 128-bit floating-point vector of `[2 x double]`. The lower 128 bits
2758/// contain the value of the source vector. The upper 128 bits are set
2759/// to zero.
2760///
2761/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_zextpd128_pd256)
2762#[inline]
2763#[target_feature(enable = "avx")]
2764// This intrinsic is only used for compilation and does not generate any
2765// instructions, thus it has zero latency.
2766#[stable(feature = "simd_x86", since = "1.27.0")]
2767pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d {
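    // The lower 128 bits come from `a`; the upper 128 bits come from the zeroed vector.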
2768    simd_shuffle!(a, _mm_setzero_pd(), [0, 1, 2, 3])
2769}
2770
2771/// Returns vector of type `__m256` with indeterminate elements.
2772/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2773/// In practice, this is equivalent to [`mem::zeroed`].
2774///
2775/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_ps)
2776#[inline]
2777#[target_feature(enable = "avx")]
2778// This intrinsic has no corresponding instruction.
2779#[stable(feature = "simd_x86", since = "1.27.0")]
2780pub unsafe fn _mm256_undefined_ps() -> __m256 {
2781    const { mem::zeroed() }
2782}
2783
2784/// Returns vector of type `__m256d` with indeterminate elements.
2785/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2786/// In practice, this is equivalent to [`mem::zeroed`].
2787///
2788/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_pd)
2789#[inline]
2790#[target_feature(enable = "avx")]
2791// This intrinsic has no corresponding instruction.
2792#[stable(feature = "simd_x86", since = "1.27.0")]
2793pub unsafe fn _mm256_undefined_pd() -> __m256d {
2794    const { mem::zeroed() }
2795}
2796
/// Returns vector of type __m256i with indeterminate elements.
2798/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
2799/// In practice, this is equivalent to [`mem::zeroed`].
2800///
2801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_undefined_si256)
2802#[inline]
2803#[target_feature(enable = "avx")]
2804// This intrinsic has no corresponding instruction.
2805#[stable(feature = "simd_x86", since = "1.27.0")]
2806pub unsafe fn _mm256_undefined_si256() -> __m256i {
2807    const { mem::zeroed() }
2808}
2809
/// Sets the packed __m256 returned vector with the supplied values; `hi` forms the
/// upper 128 bits and `lo` the lower 128 bits of the result.
2811///
2812/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128)
2813#[inline]
2814#[target_feature(enable = "avx")]
2815#[cfg_attr(test, assert_instr(vinsertf128))]
2816#[stable(feature = "simd_x86", since = "1.27.0")]
2817pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 {
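    // The first four result lanes come from `lo` and the last four from `hi`, so
    // `lo` fills the lower 128 bits and `hi` the upper 128 bits.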
2818    simd_shuffle!(lo, hi, [0, 1, 2, 3, 4, 5, 6, 7])
2819}
2820
/// Sets the packed __m256d returned vector with the supplied values; `hi` forms the
/// upper 128 bits and `lo` the lower 128 bits of the result.
2822///
2823/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128d)
2824#[inline]
2825#[target_feature(enable = "avx")]
2826#[cfg_attr(test, assert_instr(vinsertf128))]
2827#[stable(feature = "simd_x86", since = "1.27.0")]
2828pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d {
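    // Defer to the `__m256` implementation; the transmutes only reinterpret bits.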
2829    let hi: __m128 = transmute(hi);
2830    let lo: __m128 = transmute(lo);
2831    transmute(_mm256_set_m128(hi, lo))
2832}
2833
/// Sets the packed __m256i returned vector with the supplied values; `hi` forms the
/// upper 128 bits and `lo` the lower 128 bits of the result.
2835///
2836/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_set_m128i)
2837#[inline]
2838#[target_feature(enable = "avx")]
2839#[cfg_attr(test, assert_instr(vinsertf128))]
2840#[stable(feature = "simd_x86", since = "1.27.0")]
2841pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i {
2842    let hi: __m128 = transmute(hi);
2843    let lo: __m128 = transmute(lo);
2844    transmute(_mm256_set_m128(hi, lo))
2845}
2846
/// Sets the packed __m256 returned vector with the supplied values; `lo` forms the
/// lower 128 bits and `hi` the upper 128 bits of the result.
2848///
2849/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128)
2850#[inline]
2851#[target_feature(enable = "avx")]
2852#[cfg_attr(test, assert_instr(vinsertf128))]
2853#[stable(feature = "simd_x86", since = "1.27.0")]
2854pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 {
2855    _mm256_set_m128(hi, lo)
2856}
2857
/// Sets the packed __m256d returned vector with the supplied values; `lo` forms the
/// lower 128 bits and `hi` the upper 128 bits of the result.
2859///
2860/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128d)
2861#[inline]
2862#[target_feature(enable = "avx")]
2863#[cfg_attr(test, assert_instr(vinsertf128))]
2864#[stable(feature = "simd_x86", since = "1.27.0")]
2865pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d {
2866    _mm256_set_m128d(hi, lo)
2867}
2868
/// Sets the packed __m256i returned vector with the supplied values; `lo` forms the
/// lower 128 bits and `hi` the upper 128 bits of the result.
2870///
2871/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_setr_m128i)
2872#[inline]
2873#[target_feature(enable = "avx")]
2874#[cfg_attr(test, assert_instr(vinsertf128))]
2875#[stable(feature = "simd_x86", since = "1.27.0")]
2876pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i {
2877    _mm256_set_m128i(hi, lo)
2878}
2879
/// Loads two 128-bit values (composed of 4 packed single-precision (32-bit)
/// floating-point elements) from memory, and combines them into a 256-bit
/// value.
2883/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2884///
2885/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128)
2886#[inline]
2887#[target_feature(enable = "avx")]
2888// This intrinsic has no corresponding instruction.
2889#[stable(feature = "simd_x86", since = "1.27.0")]
2890pub unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256 {
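    // Load the value at `loaddr` into the lower half, then insert the value loaded
    // from `hiaddr` into the upper 128 bits.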
2891    let a = _mm256_castps128_ps256(_mm_loadu_ps(loaddr));
2892    _mm256_insertf128_ps::<1>(a, _mm_loadu_ps(hiaddr))
2893}
2894
/// Loads two 128-bit values (composed of 2 packed double-precision (64-bit)
/// floating-point elements) from memory, and combines them into a 256-bit
/// value.
2898/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2899///
2900/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128d)
2901#[inline]
2902#[target_feature(enable = "avx")]
2903// This intrinsic has no corresponding instruction.
2904#[stable(feature = "simd_x86", since = "1.27.0")]
2905pub unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d {
2906    let a = _mm256_castpd128_pd256(_mm_loadu_pd(loaddr));
2907    _mm256_insertf128_pd::<1>(a, _mm_loadu_pd(hiaddr))
2908}
2909
/// Loads two 128-bit values (composed of integer data) from memory, and combines
/// them into a 256-bit value.
2912/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2913///
2914/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_loadu2_m128i)
2915#[inline]
2916#[target_feature(enable = "avx")]
2917// This intrinsic has no corresponding instruction.
2918#[stable(feature = "simd_x86", since = "1.27.0")]
2919pub unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i {
2920    let a = _mm256_castsi128_si256(_mm_loadu_si128(loaddr));
2921    _mm256_insertf128_si256::<1>(a, _mm_loadu_si128(hiaddr))
2922}
2923
/// Stores the high and low 128-bit halves (each composed of 4 packed
/// single-precision (32-bit) floating-point elements) from `a` into memory at two
/// different 128-bit locations.
2927/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2928///
2929/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128)
2930#[inline]
2931#[target_feature(enable = "avx")]
2932// This intrinsic has no corresponding instruction.
2933#[stable(feature = "simd_x86", since = "1.27.0")]
2934pub unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256) {
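    // Store the low 128 bits of `a` to `loaddr` and the high 128 bits to `hiaddr`.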
2935    let lo = _mm256_castps256_ps128(a);
2936    _mm_storeu_ps(loaddr, lo);
2937    let hi = _mm256_extractf128_ps::<1>(a);
2938    _mm_storeu_ps(hiaddr, hi);
2939}
2940
/// Stores the high and low 128-bit halves (each composed of 2 packed
/// double-precision (64-bit) floating-point elements) from `a` into memory at two
/// different 128-bit locations.
2944/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2945///
2946/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128d)
2947#[inline]
2948#[target_feature(enable = "avx")]
2949// This intrinsic has no corresponding instruction.
2950#[stable(feature = "simd_x86", since = "1.27.0")]
2951pub unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d) {
2952    let lo = _mm256_castpd256_pd128(a);
2953    _mm_storeu_pd(loaddr, lo);
2954    let hi = _mm256_extractf128_pd::<1>(a);
2955    _mm_storeu_pd(hiaddr, hi);
2956}
2957
/// Stores the high and low 128-bit halves (each composed of integer data) from
/// `a` into memory at two different 128-bit locations.
2960/// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary.
2961///
2962/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_storeu2_m128i)
2963#[inline]
2964#[target_feature(enable = "avx")]
2965// This intrinsic has no corresponding instruction.
2966#[stable(feature = "simd_x86", since = "1.27.0")]
2967pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i) {
2968    let lo = _mm256_castsi256_si128(a);
2969    _mm_storeu_si128(loaddr, lo);
2970    let hi = _mm256_extractf128_si256::<1>(a);
2971    _mm_storeu_si128(hiaddr, hi);
2972}
2973
2974/// Returns the first element of the input vector of `[8 x float]`.
2975///
2976/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtss_f32)
2977#[inline]
2978#[target_feature(enable = "avx")]
2979//#[cfg_attr(test, assert_instr(movss))] FIXME
2980#[stable(feature = "simd_x86", since = "1.27.0")]
2981pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 {
2982    simd_extract!(a, 0)
2983}
2984
2985// LLVM intrinsics used in the above functions
2986#[allow(improper_ctypes)]
2987extern "C" {
2988    #[link_name = "llvm.x86.avx.round.pd.256"]
2989    fn roundpd256(a: __m256d, b: i32) -> __m256d;
2990    #[link_name = "llvm.x86.avx.round.ps.256"]
2991    fn roundps256(a: __m256, b: i32) -> __m256;
2992    #[link_name = "llvm.x86.avx.dp.ps.256"]
2993    fn vdpps(a: __m256, b: __m256, imm8: i32) -> __m256;
2994    #[link_name = "llvm.x86.avx.hadd.pd.256"]
2995    fn vhaddpd(a: __m256d, b: __m256d) -> __m256d;
2996    #[link_name = "llvm.x86.avx.hadd.ps.256"]
2997    fn vhaddps(a: __m256, b: __m256) -> __m256;
2998    #[link_name = "llvm.x86.avx.hsub.pd.256"]
2999    fn vhsubpd(a: __m256d, b: __m256d) -> __m256d;
3000    #[link_name = "llvm.x86.avx.hsub.ps.256"]
3001    fn vhsubps(a: __m256, b: __m256) -> __m256;
3002    #[link_name = "llvm.x86.sse2.cmp.pd"]
3003    fn vcmppd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3004    #[link_name = "llvm.x86.avx.cmp.pd.256"]
3005    fn vcmppd256(a: __m256d, b: __m256d, imm8: u8) -> __m256d;
3006    #[link_name = "llvm.x86.sse.cmp.ps"]
3007    fn vcmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
3008    #[link_name = "llvm.x86.avx.cmp.ps.256"]
3009    fn vcmpps256(a: __m256, b: __m256, imm8: u8) -> __m256;
3010    #[link_name = "llvm.x86.sse2.cmp.sd"]
3011    fn vcmpsd(a: __m128d, b: __m128d, imm8: i8) -> __m128d;
3012    #[link_name = "llvm.x86.sse.cmp.ss"]
3013    fn vcmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
3014    #[link_name = "llvm.x86.avx.cvt.ps2dq.256"]
3015    fn vcvtps2dq(a: __m256) -> i32x8;
3016    #[link_name = "llvm.x86.avx.cvtt.pd2dq.256"]
3017    fn vcvttpd2dq(a: __m256d) -> i32x4;
3018    #[link_name = "llvm.x86.avx.cvt.pd2dq.256"]
3019    fn vcvtpd2dq(a: __m256d) -> i32x4;
3020    #[link_name = "llvm.x86.avx.cvtt.ps2dq.256"]
3021    fn vcvttps2dq(a: __m256) -> i32x8;
3022    #[link_name = "llvm.x86.avx.vzeroall"]
3023    fn vzeroall();
3024    #[link_name = "llvm.x86.avx.vzeroupper"]
3025    fn vzeroupper();
3026    #[link_name = "llvm.x86.avx.vpermilvar.ps.256"]
3027    fn vpermilps256(a: __m256, b: i32x8) -> __m256;
3028    #[link_name = "llvm.x86.avx.vpermilvar.ps"]
3029    fn vpermilps(a: __m128, b: i32x4) -> __m128;
3030    #[link_name = "llvm.x86.avx.vpermilvar.pd.256"]
3031    fn vpermilpd256(a: __m256d, b: i64x4) -> __m256d;
3032    #[link_name = "llvm.x86.avx.vpermilvar.pd"]
3033    fn vpermilpd(a: __m128d, b: i64x2) -> __m128d;
3034    #[link_name = "llvm.x86.avx.vperm2f128.ps.256"]
3035    fn vperm2f128ps256(a: __m256, b: __m256, imm8: i8) -> __m256;
3036    #[link_name = "llvm.x86.avx.vperm2f128.pd.256"]
3037    fn vperm2f128pd256(a: __m256d, b: __m256d, imm8: i8) -> __m256d;
3038    #[link_name = "llvm.x86.avx.vperm2f128.si.256"]
3039    fn vperm2f128si256(a: i32x8, b: i32x8, imm8: i8) -> i32x8;
3040    #[link_name = "llvm.x86.avx.maskload.pd.256"]
3041    fn maskloadpd256(mem_addr: *const i8, mask: i64x4) -> __m256d;
3042    #[link_name = "llvm.x86.avx.maskstore.pd.256"]
3043    fn maskstorepd256(mem_addr: *mut i8, mask: i64x4, a: __m256d);
3044    #[link_name = "llvm.x86.avx.maskload.pd"]
3045    fn maskloadpd(mem_addr: *const i8, mask: i64x2) -> __m128d;
3046    #[link_name = "llvm.x86.avx.maskstore.pd"]
3047    fn maskstorepd(mem_addr: *mut i8, mask: i64x2, a: __m128d);
3048    #[link_name = "llvm.x86.avx.maskload.ps.256"]
3049    fn maskloadps256(mem_addr: *const i8, mask: i32x8) -> __m256;
3050    #[link_name = "llvm.x86.avx.maskstore.ps.256"]
3051    fn maskstoreps256(mem_addr: *mut i8, mask: i32x8, a: __m256);
3052    #[link_name = "llvm.x86.avx.maskload.ps"]
3053    fn maskloadps(mem_addr: *const i8, mask: i32x4) -> __m128;
3054    #[link_name = "llvm.x86.avx.maskstore.ps"]
3055    fn maskstoreps(mem_addr: *mut i8, mask: i32x4, a: __m128);
3056    #[link_name = "llvm.x86.avx.ldu.dq.256"]
3057    fn vlddqu(mem_addr: *const i8) -> i8x32;
3058    #[link_name = "llvm.x86.avx.rcp.ps.256"]
3059    fn vrcpps(a: __m256) -> __m256;
3060    #[link_name = "llvm.x86.avx.rsqrt.ps.256"]
3061    fn vrsqrtps(a: __m256) -> __m256;
3062    #[link_name = "llvm.x86.avx.ptestz.256"]
3063    fn ptestz256(a: i64x4, b: i64x4) -> i32;
3064    #[link_name = "llvm.x86.avx.ptestc.256"]
3065    fn ptestc256(a: i64x4, b: i64x4) -> i32;
3066    #[link_name = "llvm.x86.avx.ptestnzc.256"]
3067    fn ptestnzc256(a: i64x4, b: i64x4) -> i32;
3068    #[link_name = "llvm.x86.avx.vtestz.pd.256"]
3069    fn vtestzpd256(a: __m256d, b: __m256d) -> i32;
3070    #[link_name = "llvm.x86.avx.vtestc.pd.256"]
3071    fn vtestcpd256(a: __m256d, b: __m256d) -> i32;
3072    #[link_name = "llvm.x86.avx.vtestnzc.pd.256"]
3073    fn vtestnzcpd256(a: __m256d, b: __m256d) -> i32;
3074    #[link_name = "llvm.x86.avx.vtestz.pd"]
3075    fn vtestzpd(a: __m128d, b: __m128d) -> i32;
3076    #[link_name = "llvm.x86.avx.vtestc.pd"]
3077    fn vtestcpd(a: __m128d, b: __m128d) -> i32;
3078    #[link_name = "llvm.x86.avx.vtestnzc.pd"]
3079    fn vtestnzcpd(a: __m128d, b: __m128d) -> i32;
3080    #[link_name = "llvm.x86.avx.vtestz.ps.256"]
3081    fn vtestzps256(a: __m256, b: __m256) -> i32;
3082    #[link_name = "llvm.x86.avx.vtestc.ps.256"]
3083    fn vtestcps256(a: __m256, b: __m256) -> i32;
3084    #[link_name = "llvm.x86.avx.vtestnzc.ps.256"]
3085    fn vtestnzcps256(a: __m256, b: __m256) -> i32;
3086    #[link_name = "llvm.x86.avx.vtestz.ps"]
3087    fn vtestzps(a: __m128, b: __m128) -> i32;
3088    #[link_name = "llvm.x86.avx.vtestc.ps"]
3089    fn vtestcps(a: __m128, b: __m128) -> i32;
3090    #[link_name = "llvm.x86.avx.vtestnzc.ps"]
3091    fn vtestnzcps(a: __m128, b: __m128) -> i32;
3092    #[link_name = "llvm.x86.avx.min.ps.256"]
3093    fn vminps(a: __m256, b: __m256) -> __m256;
3094    #[link_name = "llvm.x86.avx.max.ps.256"]
3095    fn vmaxps(a: __m256, b: __m256) -> __m256;
3096    #[link_name = "llvm.x86.avx.min.pd.256"]
3097    fn vminpd(a: __m256d, b: __m256d) -> __m256d;
3098    #[link_name = "llvm.x86.avx.max.pd.256"]
3099    fn vmaxpd(a: __m256d, b: __m256d) -> __m256d;
3100}
3101
3102#[cfg(test)]
3103mod tests {
3104    use crate::hint::black_box;
3105    use crate::ptr;
3106    use stdarch_test::simd_test;
3107
3108    use crate::core_arch::x86::*;
3109
3110    #[simd_test(enable = "avx")]
3111    unsafe fn test_mm256_add_pd() {
3112        let a = _mm256_setr_pd(1., 2., 3., 4.);
3113        let b = _mm256_setr_pd(5., 6., 7., 8.);
3114        let r = _mm256_add_pd(a, b);
3115        let e = _mm256_setr_pd(6., 8., 10., 12.);
3116        assert_eq_m256d(r, e);
3117    }
3118
3119    #[simd_test(enable = "avx")]
3120    unsafe fn test_mm256_add_ps() {
3121        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3122        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3123        let r = _mm256_add_ps(a, b);
3124        let e = _mm256_setr_ps(10., 12., 14., 16., 18., 20., 22., 24.);
3125        assert_eq_m256(r, e);
3126    }
3127
3128    #[simd_test(enable = "avx")]
3129    unsafe fn test_mm256_and_pd() {
3130        let a = _mm256_set1_pd(1.);
3131        let b = _mm256_set1_pd(0.6);
3132        let r = _mm256_and_pd(a, b);
3133        let e = _mm256_set1_pd(0.5);
3134        assert_eq_m256d(r, e);
3135    }
3136
3137    #[simd_test(enable = "avx")]
3138    unsafe fn test_mm256_and_ps() {
3139        let a = _mm256_set1_ps(1.);
3140        let b = _mm256_set1_ps(0.6);
3141        let r = _mm256_and_ps(a, b);
3142        let e = _mm256_set1_ps(0.5);
3143        assert_eq_m256(r, e);
3144    }
3145
3146    #[simd_test(enable = "avx")]
3147    unsafe fn test_mm256_or_pd() {
3148        let a = _mm256_set1_pd(1.);
3149        let b = _mm256_set1_pd(0.6);
3150        let r = _mm256_or_pd(a, b);
3151        let e = _mm256_set1_pd(1.2);
3152        assert_eq_m256d(r, e);
3153    }
3154
3155    #[simd_test(enable = "avx")]
3156    unsafe fn test_mm256_or_ps() {
3157        let a = _mm256_set1_ps(1.);
3158        let b = _mm256_set1_ps(0.6);
3159        let r = _mm256_or_ps(a, b);
3160        let e = _mm256_set1_ps(1.2);
3161        assert_eq_m256(r, e);
3162    }
3163
3164    #[simd_test(enable = "avx")]
3165    unsafe fn test_mm256_shuffle_pd() {
3166        let a = _mm256_setr_pd(1., 4., 5., 8.);
3167        let b = _mm256_setr_pd(2., 3., 6., 7.);
3168        let r = _mm256_shuffle_pd::<0b11_11_11_11>(a, b);
3169        let e = _mm256_setr_pd(4., 3., 8., 7.);
3170        assert_eq_m256d(r, e);
3171    }
3172
3173    #[simd_test(enable = "avx")]
3174    unsafe fn test_mm256_shuffle_ps() {
3175        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3176        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3177        let r = _mm256_shuffle_ps::<0b00_00_11_11>(a, b);
3178        let e = _mm256_setr_ps(8., 8., 2., 2., 16., 16., 10., 10.);
3179        assert_eq_m256(r, e);
3180    }
3181
3182    #[simd_test(enable = "avx")]
3183    unsafe fn test_mm256_andnot_pd() {
3184        let a = _mm256_set1_pd(0.);
3185        let b = _mm256_set1_pd(0.6);
3186        let r = _mm256_andnot_pd(a, b);
3187        assert_eq_m256d(r, b);
3188    }
3189
3190    #[simd_test(enable = "avx")]
3191    unsafe fn test_mm256_andnot_ps() {
3192        let a = _mm256_set1_ps(0.);
3193        let b = _mm256_set1_ps(0.6);
3194        let r = _mm256_andnot_ps(a, b);
3195        assert_eq_m256(r, b);
3196    }
3197
3198    #[simd_test(enable = "avx")]
3199    unsafe fn test_mm256_max_pd() {
3200        let a = _mm256_setr_pd(1., 4., 5., 8.);
3201        let b = _mm256_setr_pd(2., 3., 6., 7.);
3202        let r = _mm256_max_pd(a, b);
3203        let e = _mm256_setr_pd(2., 4., 6., 8.);
3204        assert_eq_m256d(r, e);
3205        // > If the values being compared are both 0.0s (of either sign), the
3206        // > value in the second operand (source operand) is returned.
3207        let w = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3208        let x = _mm256_max_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3209        let wu: [u64; 4] = transmute(w);
3210        let xu: [u64; 4] = transmute(x);
3211        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3212        assert_eq!(xu, [0u64; 4]);
3213        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3214        // > second operand (source operand), either a NaN or a valid
3215        // > floating-point value, is written to the result.
3216        let y = _mm256_max_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3217        let z = _mm256_max_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3218        let yf: [f64; 4] = transmute(y);
3219        let zf: [f64; 4] = transmute(z);
3220        assert_eq!(yf, [0.0; 4]);
3221        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3222    }
3223
3224    #[simd_test(enable = "avx")]
3225    unsafe fn test_mm256_max_ps() {
3226        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3227        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3228        let r = _mm256_max_ps(a, b);
3229        let e = _mm256_setr_ps(2., 4., 6., 8., 10., 12., 14., 16.);
3230        assert_eq_m256(r, e);
3231        // > If the values being compared are both 0.0s (of either sign), the
3232        // > value in the second operand (source operand) is returned.
3233        let w = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3234        let x = _mm256_max_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3235        let wu: [u32; 8] = transmute(w);
3236        let xu: [u32; 8] = transmute(x);
3237        assert_eq!(wu, [0x8000_0000u32; 8]);
3238        assert_eq!(xu, [0u32; 8]);
3239        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3240        // > second operand (source operand), either a NaN or a valid
3241        // > floating-point value, is written to the result.
3242        let y = _mm256_max_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3243        let z = _mm256_max_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3244        let yf: [f32; 8] = transmute(y);
3245        let zf: [f32; 8] = transmute(z);
3246        assert_eq!(yf, [0.0; 8]);
3247        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3248    }
3249
3250    #[simd_test(enable = "avx")]
3251    unsafe fn test_mm256_min_pd() {
3252        let a = _mm256_setr_pd(1., 4., 5., 8.);
3253        let b = _mm256_setr_pd(2., 3., 6., 7.);
3254        let r = _mm256_min_pd(a, b);
3255        let e = _mm256_setr_pd(1., 3., 5., 7.);
3256        assert_eq_m256d(r, e);
3257        // > If the values being compared are both 0.0s (of either sign), the
3258        // > value in the second operand (source operand) is returned.
3259        let w = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(-0.0));
3260        let x = _mm256_min_pd(_mm256_set1_pd(-0.0), _mm256_set1_pd(0.0));
3261        let wu: [u64; 4] = transmute(w);
3262        let xu: [u64; 4] = transmute(x);
3263        assert_eq!(wu, [0x8000_0000_0000_0000u64; 4]);
3264        assert_eq!(xu, [0u64; 4]);
3265        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3266        // > second operand (source operand), either a NaN or a valid
3267        // > floating-point value, is written to the result.
3268        let y = _mm256_min_pd(_mm256_set1_pd(f64::NAN), _mm256_set1_pd(0.0));
3269        let z = _mm256_min_pd(_mm256_set1_pd(0.0), _mm256_set1_pd(f64::NAN));
3270        let yf: [f64; 4] = transmute(y);
3271        let zf: [f64; 4] = transmute(z);
3272        assert_eq!(yf, [0.0; 4]);
3273        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3274    }
3275
3276    #[simd_test(enable = "avx")]
3277    unsafe fn test_mm256_min_ps() {
3278        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3279        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3280        let r = _mm256_min_ps(a, b);
3281        let e = _mm256_setr_ps(1., 3., 5., 7., 9., 11., 13., 15.);
3282        assert_eq_m256(r, e);
3283        // > If the values being compared are both 0.0s (of either sign), the
3284        // > value in the second operand (source operand) is returned.
3285        let w = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(-0.0));
3286        let x = _mm256_min_ps(_mm256_set1_ps(-0.0), _mm256_set1_ps(0.0));
3287        let wu: [u32; 8] = transmute(w);
3288        let xu: [u32; 8] = transmute(x);
3289        assert_eq!(wu, [0x8000_0000u32; 8]);
3290        assert_eq!(xu, [0u32; 8]);
3291        // > If only one value is a NaN (SNaN or QNaN) for this instruction, the
3292        // > second operand (source operand), either a NaN or a valid
3293        // > floating-point value, is written to the result.
3294        let y = _mm256_min_ps(_mm256_set1_ps(f32::NAN), _mm256_set1_ps(0.0));
3295        let z = _mm256_min_ps(_mm256_set1_ps(0.0), _mm256_set1_ps(f32::NAN));
3296        let yf: [f32; 8] = transmute(y);
3297        let zf: [f32; 8] = transmute(z);
3298        assert_eq!(yf, [0.0; 8]);
3299        assert!(zf.iter().all(|f| f.is_nan()), "{:?}", zf);
3300    }
3301
3302    #[simd_test(enable = "avx")]
3303    unsafe fn test_mm256_mul_pd() {
3304        let a = _mm256_setr_pd(1., 2., 3., 4.);
3305        let b = _mm256_setr_pd(5., 6., 7., 8.);
3306        let r = _mm256_mul_pd(a, b);
3307        let e = _mm256_setr_pd(5., 12., 21., 32.);
3308        assert_eq_m256d(r, e);
3309    }
3310
3311    #[simd_test(enable = "avx")]
3312    unsafe fn test_mm256_mul_ps() {
3313        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
3314        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
3315        let r = _mm256_mul_ps(a, b);
3316        let e = _mm256_setr_ps(9., 20., 33., 48., 65., 84., 105., 128.);
3317        assert_eq_m256(r, e);
3318    }
3319
3320    #[simd_test(enable = "avx")]
3321    unsafe fn test_mm256_addsub_pd() {
3322        let a = _mm256_setr_pd(1., 2., 3., 4.);
3323        let b = _mm256_setr_pd(5., 6., 7., 8.);
3324        let r = _mm256_addsub_pd(a, b);
3325        let e = _mm256_setr_pd(-4., 8., -4., 12.);
3326        assert_eq_m256d(r, e);
3327    }
3328
3329    #[simd_test(enable = "avx")]
3330    unsafe fn test_mm256_addsub_ps() {
3331        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3332        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3333        let r = _mm256_addsub_ps(a, b);
3334        let e = _mm256_setr_ps(-4., 8., -4., 12., -4., 8., -4., 12.);
3335        assert_eq_m256(r, e);
3336    }
3337
3338    #[simd_test(enable = "avx")]
3339    unsafe fn test_mm256_sub_pd() {
3340        let a = _mm256_setr_pd(1., 2., 3., 4.);
3341        let b = _mm256_setr_pd(5., 6., 7., 8.);
3342        let r = _mm256_sub_pd(a, b);
3343        let e = _mm256_setr_pd(-4., -4., -4., -4.);
3344        assert_eq_m256d(r, e);
3345    }
3346
3347    #[simd_test(enable = "avx")]
3348    unsafe fn test_mm256_sub_ps() {
3349        let a = _mm256_setr_ps(1., 2., 3., 4., -1., -2., -3., -4.);
3350        let b = _mm256_setr_ps(5., 6., 7., 8., 3., 2., 1., 0.);
3351        let r = _mm256_sub_ps(a, b);
3352        let e = _mm256_setr_ps(-4., -4., -4., -4., -4., -4., -4., -4.);
3353        assert_eq_m256(r, e);
3354    }
3355
3356    #[simd_test(enable = "avx")]
3357    unsafe fn test_mm256_round_pd() {
3358        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3359        let result_closest = _mm256_round_pd::<0b0000>(a);
3360        let result_down = _mm256_round_pd::<0b0001>(a);
3361        let result_up = _mm256_round_pd::<0b0010>(a);
3362        let expected_closest = _mm256_setr_pd(2., 2., 4., -1.);
3363        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3364        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3365        assert_eq_m256d(result_closest, expected_closest);
3366        assert_eq_m256d(result_down, expected_down);
3367        assert_eq_m256d(result_up, expected_up);
3368    }
3369
3370    #[simd_test(enable = "avx")]
3371    unsafe fn test_mm256_floor_pd() {
3372        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3373        let result_down = _mm256_floor_pd(a);
3374        let expected_down = _mm256_setr_pd(1., 2., 3., -2.);
3375        assert_eq_m256d(result_down, expected_down);
3376    }
3377
3378    #[simd_test(enable = "avx")]
3379    unsafe fn test_mm256_ceil_pd() {
3380        let a = _mm256_setr_pd(1.55, 2.2, 3.99, -1.2);
3381        let result_up = _mm256_ceil_pd(a);
3382        let expected_up = _mm256_setr_pd(2., 3., 4., -1.);
3383        assert_eq_m256d(result_up, expected_up);
3384    }
3385
3386    #[simd_test(enable = "avx")]
3387    unsafe fn test_mm256_round_ps() {
3388        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3389        let result_closest = _mm256_round_ps::<0b0000>(a);
3390        let result_down = _mm256_round_ps::<0b0001>(a);
3391        let result_up = _mm256_round_ps::<0b0010>(a);
3392        let expected_closest = _mm256_setr_ps(2., 2., 4., -1., 2., 2., 4., -1.);
3393        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3394        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3395        assert_eq_m256(result_closest, expected_closest);
3396        assert_eq_m256(result_down, expected_down);
3397        assert_eq_m256(result_up, expected_up);
3398    }
3399
3400    #[simd_test(enable = "avx")]
3401    unsafe fn test_mm256_floor_ps() {
3402        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3403        let result_down = _mm256_floor_ps(a);
3404        let expected_down = _mm256_setr_ps(1., 2., 3., -2., 1., 2., 3., -2.);
3405        assert_eq_m256(result_down, expected_down);
3406    }
3407
3408    #[simd_test(enable = "avx")]
3409    unsafe fn test_mm256_ceil_ps() {
3410        let a = _mm256_setr_ps(1.55, 2.2, 3.99, -1.2, 1.55, 2.2, 3.99, -1.2);
3411        let result_up = _mm256_ceil_ps(a);
3412        let expected_up = _mm256_setr_ps(2., 3., 4., -1., 2., 3., 4., -1.);
3413        assert_eq_m256(result_up, expected_up);
3414    }
3415
3416    #[simd_test(enable = "avx")]
3417    unsafe fn test_mm256_sqrt_pd() {
3418        let a = _mm256_setr_pd(4., 9., 16., 25.);
3419        let r = _mm256_sqrt_pd(a);
3420        let e = _mm256_setr_pd(2., 3., 4., 5.);
3421        assert_eq_m256d(r, e);
3422    }
3423
3424    #[simd_test(enable = "avx")]
3425    unsafe fn test_mm256_sqrt_ps() {
3426        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3427        let r = _mm256_sqrt_ps(a);
3428        let e = _mm256_setr_ps(2., 3., 4., 5., 2., 3., 4., 5.);
3429        assert_eq_m256(r, e);
3430    }
3431
3432    #[simd_test(enable = "avx")]
3433    unsafe fn test_mm256_div_ps() {
3434        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3435        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3436        let r = _mm256_div_ps(a, b);
3437        let e = _mm256_setr_ps(1., 3., 8., 5., 0.5, 1., 0.25, 0.5);
3438        assert_eq_m256(r, e);
3439    }
3440
3441    #[simd_test(enable = "avx")]
3442    unsafe fn test_mm256_div_pd() {
3443        let a = _mm256_setr_pd(4., 9., 16., 25.);
3444        let b = _mm256_setr_pd(4., 3., 2., 5.);
3445        let r = _mm256_div_pd(a, b);
3446        let e = _mm256_setr_pd(1., 3., 8., 5.);
3447        assert_eq_m256d(r, e);
3448    }
3449
3450    #[simd_test(enable = "avx")]
3451    unsafe fn test_mm256_blend_pd() {
3452        let a = _mm256_setr_pd(4., 9., 16., 25.);
3453        let b = _mm256_setr_pd(4., 3., 2., 5.);
3454        let r = _mm256_blend_pd::<0x0>(a, b);
3455        assert_eq_m256d(r, _mm256_setr_pd(4., 9., 16., 25.));
3456        let r = _mm256_blend_pd::<0x3>(a, b);
3457        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 16., 25.));
3458        let r = _mm256_blend_pd::<0xF>(a, b);
3459        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 5.));
3460    }
3461
3462    #[simd_test(enable = "avx")]
3463    unsafe fn test_mm256_blend_ps() {
3464        let a = _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.);
3465        let b = _mm256_setr_ps(2., 3., 6., 7., 10., 11., 14., 15.);
3466        let r = _mm256_blend_ps::<0x0>(a, b);
3467        assert_eq_m256(r, _mm256_setr_ps(1., 4., 5., 8., 9., 12., 13., 16.));
3468        let r = _mm256_blend_ps::<0x3>(a, b);
3469        assert_eq_m256(r, _mm256_setr_ps(2., 3., 5., 8., 9., 12., 13., 16.));
3470        let r = _mm256_blend_ps::<0xF>(a, b);
3471        assert_eq_m256(r, _mm256_setr_ps(2., 3., 6., 7., 9., 12., 13., 16.));
3472    }
3473
3474    #[simd_test(enable = "avx")]
3475    unsafe fn test_mm256_blendv_pd() {
3476        let a = _mm256_setr_pd(4., 9., 16., 25.);
3477        let b = _mm256_setr_pd(4., 3., 2., 5.);
3478        let c = _mm256_setr_pd(0., 0., !0 as f64, !0 as f64);
3479        let r = _mm256_blendv_pd(a, b, c);
3480        let e = _mm256_setr_pd(4., 9., 2., 5.);
3481        assert_eq_m256d(r, e);
3482    }
3483
3484    #[simd_test(enable = "avx")]
3485    unsafe fn test_mm256_blendv_ps() {
3486        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3487        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3488        #[rustfmt::skip]
3489        let c = _mm256_setr_ps(
3490            0., 0., 0., 0., !0 as f32, !0 as f32, !0 as f32, !0 as f32,
3491        );
3492        let r = _mm256_blendv_ps(a, b, c);
3493        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3494        assert_eq_m256(r, e);
3495    }
3496
3497    #[simd_test(enable = "avx")]
3498    unsafe fn test_mm256_dp_ps() {
3499        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3500        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3501        let r = _mm256_dp_ps::<0xFF>(a, b);
3502        let e = _mm256_setr_ps(200., 200., 200., 200., 2387., 2387., 2387., 2387.);
3503        assert_eq_m256(r, e);
3504    }
3505
3506    #[simd_test(enable = "avx")]
3507    unsafe fn test_mm256_hadd_pd() {
3508        let a = _mm256_setr_pd(4., 9., 16., 25.);
3509        let b = _mm256_setr_pd(4., 3., 2., 5.);
3510        let r = _mm256_hadd_pd(a, b);
3511        let e = _mm256_setr_pd(13., 7., 41., 7.);
3512        assert_eq_m256d(r, e);
3513
3514        let a = _mm256_setr_pd(1., 2., 3., 4.);
3515        let b = _mm256_setr_pd(5., 6., 7., 8.);
3516        let r = _mm256_hadd_pd(a, b);
3517        let e = _mm256_setr_pd(3., 11., 7., 15.);
3518        assert_eq_m256d(r, e);
3519    }
3520
3521    #[simd_test(enable = "avx")]
3522    unsafe fn test_mm256_hadd_ps() {
3523        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3524        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3525        let r = _mm256_hadd_ps(a, b);
3526        let e = _mm256_setr_ps(13., 41., 7., 7., 13., 41., 17., 114.);
3527        assert_eq_m256(r, e);
3528
3529        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3530        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3531        let r = _mm256_hadd_ps(a, b);
3532        let e = _mm256_setr_ps(3., 7., 11., 15., 3., 7., 11., 15.);
3533        assert_eq_m256(r, e);
3534    }
3535
3536    #[simd_test(enable = "avx")]
3537    unsafe fn test_mm256_hsub_pd() {
3538        let a = _mm256_setr_pd(4., 9., 16., 25.);
3539        let b = _mm256_setr_pd(4., 3., 2., 5.);
3540        let r = _mm256_hsub_pd(a, b);
3541        let e = _mm256_setr_pd(-5., 1., -9., -3.);
3542        assert_eq_m256d(r, e);
3543
3544        let a = _mm256_setr_pd(1., 2., 3., 4.);
3545        let b = _mm256_setr_pd(5., 6., 7., 8.);
3546        let r = _mm256_hsub_pd(a, b);
3547        let e = _mm256_setr_pd(-1., -1., -1., -1.);
3548        assert_eq_m256d(r, e);
3549    }
3550
3551    #[simd_test(enable = "avx")]
3552    unsafe fn test_mm256_hsub_ps() {
3553        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3554        let b = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3555        let r = _mm256_hsub_ps(a, b);
3556        let e = _mm256_setr_ps(-5., -9., 1., -3., -5., -9., -1., 14.);
3557        assert_eq_m256(r, e);
3558
3559        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3560        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3561        let r = _mm256_hsub_ps(a, b);
3562        let e = _mm256_setr_ps(-1., -1., -1., -1., -1., -1., -1., -1.);
3563        assert_eq_m256(r, e);
3564    }
3565
3566    #[simd_test(enable = "avx")]
3567    unsafe fn test_mm256_xor_pd() {
3568        let a = _mm256_setr_pd(4., 9., 16., 25.);
3569        let b = _mm256_set1_pd(0.);
3570        let r = _mm256_xor_pd(a, b);
3571        assert_eq_m256d(r, a);
3572    }
3573
3574    #[simd_test(enable = "avx")]
3575    unsafe fn test_mm256_xor_ps() {
3576        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3577        let b = _mm256_set1_ps(0.);
3578        let r = _mm256_xor_ps(a, b);
3579        assert_eq_m256(r, a);
3580    }
3581
3582    #[simd_test(enable = "avx")]
3583    unsafe fn test_mm_cmp_pd() {
3584        let a = _mm_setr_pd(4., 9.);
3585        let b = _mm_setr_pd(4., 3.);
3586        let r = _mm_cmp_pd::<_CMP_GE_OS>(a, b);
3587        assert!(get_m128d(r, 0).is_nan());
3588        assert!(get_m128d(r, 1).is_nan());
3589    }
3590
3591    #[simd_test(enable = "avx")]
3592    unsafe fn test_mm256_cmp_pd() {
3593        let a = _mm256_setr_pd(1., 2., 3., 4.);
3594        let b = _mm256_setr_pd(5., 6., 7., 8.);
3595        let r = _mm256_cmp_pd::<_CMP_GE_OS>(a, b);
3596        let e = _mm256_set1_pd(0.);
3597        assert_eq_m256d(r, e);
3598    }
3599
3600    #[simd_test(enable = "avx")]
3601    unsafe fn test_mm_cmp_ps() {
3602        let a = _mm_setr_ps(4., 3., 2., 5.);
3603        let b = _mm_setr_ps(4., 9., 16., 25.);
3604        let r = _mm_cmp_ps::<_CMP_GE_OS>(a, b);
3605        assert!(get_m128(r, 0).is_nan());
3606        assert_eq!(get_m128(r, 1), 0.);
3607        assert_eq!(get_m128(r, 2), 0.);
3608        assert_eq!(get_m128(r, 3), 0.);
3609    }
3610
3611    #[simd_test(enable = "avx")]
3612    unsafe fn test_mm256_cmp_ps() {
3613        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3614        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3615        let r = _mm256_cmp_ps::<_CMP_GE_OS>(a, b);
3616        let e = _mm256_set1_ps(0.);
3617        assert_eq_m256(r, e);
3618    }
3619
3620    #[simd_test(enable = "avx")]
3621    unsafe fn test_mm_cmp_sd() {
3622        let a = _mm_setr_pd(4., 9.);
3623        let b = _mm_setr_pd(4., 3.);
3624        let r = _mm_cmp_sd::<_CMP_GE_OS>(a, b);
3625        assert!(get_m128d(r, 0).is_nan());
3626        assert_eq!(get_m128d(r, 1), 9.);
3627    }
3628
3629    #[simd_test(enable = "avx")]
3630    unsafe fn test_mm_cmp_ss() {
3631        let a = _mm_setr_ps(4., 3., 2., 5.);
3632        let b = _mm_setr_ps(4., 9., 16., 25.);
3633        let r = _mm_cmp_ss::<_CMP_GE_OS>(a, b);
3634        assert!(get_m128(r, 0).is_nan());
3635        assert_eq!(get_m128(r, 1), 3.);
3636        assert_eq!(get_m128(r, 2), 2.);
3637        assert_eq!(get_m128(r, 3), 5.);
3638    }
3639
3640    #[simd_test(enable = "avx")]
3641    unsafe fn test_mm256_cvtepi32_pd() {
3642        let a = _mm_setr_epi32(4, 9, 16, 25);
3643        let r = _mm256_cvtepi32_pd(a);
3644        let e = _mm256_setr_pd(4., 9., 16., 25.);
3645        assert_eq_m256d(r, e);
3646    }
3647
3648    #[simd_test(enable = "avx")]
3649    unsafe fn test_mm256_cvtepi32_ps() {
3650        let a = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3651        let r = _mm256_cvtepi32_ps(a);
3652        let e = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3653        assert_eq_m256(r, e);
3654    }
3655
3656    #[simd_test(enable = "avx")]
3657    unsafe fn test_mm256_cvtpd_ps() {
3658        let a = _mm256_setr_pd(4., 9., 16., 25.);
3659        let r = _mm256_cvtpd_ps(a);
3660        let e = _mm_setr_ps(4., 9., 16., 25.);
3661        assert_eq_m128(r, e);
3662    }
3663
3664    #[simd_test(enable = "avx")]
3665    unsafe fn test_mm256_cvtps_epi32() {
3666        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3667        let r = _mm256_cvtps_epi32(a);
3668        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3669        assert_eq_m256i(r, e);
3670    }
3671
3672    #[simd_test(enable = "avx")]
3673    unsafe fn test_mm256_cvtps_pd() {
3674        let a = _mm_setr_ps(4., 9., 16., 25.);
3675        let r = _mm256_cvtps_pd(a);
3676        let e = _mm256_setr_pd(4., 9., 16., 25.);
3677        assert_eq_m256d(r, e);
3678    }
3679
3680    #[simd_test(enable = "avx")]
3681    unsafe fn test_mm256_cvtsd_f64() {
3682        let a = _mm256_setr_pd(1., 2., 3., 4.);
3683        let r = _mm256_cvtsd_f64(a);
3684        assert_eq!(r, 1.);
3685    }
3686
3687    #[simd_test(enable = "avx")]
3688    unsafe fn test_mm256_cvttpd_epi32() {
3689        let a = _mm256_setr_pd(4., 9., 16., 25.);
3690        let r = _mm256_cvttpd_epi32(a);
3691        let e = _mm_setr_epi32(4, 9, 16, 25);
3692        assert_eq_m128i(r, e);
3693    }
3694
3695    #[simd_test(enable = "avx")]
3696    unsafe fn test_mm256_cvtpd_epi32() {
3697        let a = _mm256_setr_pd(4., 9., 16., 25.);
3698        let r = _mm256_cvtpd_epi32(a);
3699        let e = _mm_setr_epi32(4, 9, 16, 25);
3700        assert_eq_m128i(r, e);
3701    }
3702
3703    #[simd_test(enable = "avx")]
3704    unsafe fn test_mm256_cvttps_epi32() {
3705        let a = _mm256_setr_ps(4., 9., 16., 25., 4., 9., 16., 25.);
3706        let r = _mm256_cvttps_epi32(a);
3707        let e = _mm256_setr_epi32(4, 9, 16, 25, 4, 9, 16, 25);
3708        assert_eq_m256i(r, e);
3709    }
3710
3711    #[simd_test(enable = "avx")]
3712    unsafe fn test_mm256_extractf128_ps() {
3713        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3714        let r = _mm256_extractf128_ps::<0>(a);
3715        let e = _mm_setr_ps(4., 3., 2., 5.);
3716        assert_eq_m128(r, e);
3717    }
3718
3719    #[simd_test(enable = "avx")]
3720    unsafe fn test_mm256_extractf128_pd() {
3721        let a = _mm256_setr_pd(4., 3., 2., 5.);
3722        let r = _mm256_extractf128_pd::<0>(a);
3723        let e = _mm_setr_pd(4., 3.);
3724        assert_eq_m128d(r, e);
3725    }
3726
3727    #[simd_test(enable = "avx")]
3728    unsafe fn test_mm256_extractf128_si256() {
3729        let a = _mm256_setr_epi64x(4, 3, 2, 5);
3730        let r = _mm256_extractf128_si256::<0>(a);
3731        let e = _mm_setr_epi64x(4, 3);
3732        assert_eq_m128i(r, e);
3733    }
3734
3735    #[simd_test(enable = "avx")]
3736    unsafe fn test_mm256_extract_epi32() {
3737        let a = _mm256_setr_epi32(-1, 1, 2, 3, 4, 5, 6, 7);
3738        let r1 = _mm256_extract_epi32::<0>(a);
3739        let r2 = _mm256_extract_epi32::<3>(a);
3740        assert_eq!(r1, -1);
3741        assert_eq!(r2, 3);
3742    }
3743
3744    #[simd_test(enable = "avx")]
3745    unsafe fn test_mm256_cvtsi256_si32() {
3746        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3747        let r = _mm256_cvtsi256_si32(a);
3748        assert_eq!(r, 1);
3749    }
3750
3751    #[simd_test(enable = "avx")]
3752    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3753    unsafe fn test_mm256_zeroall() {
3754        _mm256_zeroall();
3755    }
3756
3757    #[simd_test(enable = "avx")]
3758    #[cfg_attr(miri, ignore)] // Register-level operation not supported by Miri
3759    unsafe fn test_mm256_zeroupper() {
3760        _mm256_zeroupper();
3761    }
3762
3763    #[simd_test(enable = "avx")]
3764    unsafe fn test_mm256_permutevar_ps() {
3765        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3766        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3767        let r = _mm256_permutevar_ps(a, b);
3768        let e = _mm256_setr_ps(3., 2., 5., 4., 9., 64., 50., 8.);
3769        assert_eq_m256(r, e);
3770    }
3771
3772    #[simd_test(enable = "avx")]
3773    unsafe fn test_mm_permutevar_ps() {
3774        let a = _mm_setr_ps(4., 3., 2., 5.);
3775        let b = _mm_setr_epi32(1, 2, 3, 4);
3776        let r = _mm_permutevar_ps(a, b);
3777        let e = _mm_setr_ps(3., 2., 5., 4.);
3778        assert_eq_m128(r, e);
3779    }
3780
3781    #[simd_test(enable = "avx")]
3782    unsafe fn test_mm256_permute_ps() {
3783        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3784        let r = _mm256_permute_ps::<0x1b>(a);
3785        let e = _mm256_setr_ps(5., 2., 3., 4., 50., 64., 9., 8.);
3786        assert_eq_m256(r, e);
3787    }
3788
3789    #[simd_test(enable = "avx")]
3790    unsafe fn test_mm_permute_ps() {
3791        let a = _mm_setr_ps(4., 3., 2., 5.);
3792        let r = _mm_permute_ps::<0x1b>(a);
3793        let e = _mm_setr_ps(5., 2., 3., 4.);
3794        assert_eq_m128(r, e);
3795    }
3796
3797    #[simd_test(enable = "avx")]
3798    unsafe fn test_mm256_permutevar_pd() {
3799        let a = _mm256_setr_pd(4., 3., 2., 5.);
3800        let b = _mm256_setr_epi64x(1, 2, 3, 4);
3801        let r = _mm256_permutevar_pd(a, b);
3802        let e = _mm256_setr_pd(4., 3., 5., 2.);
3803        assert_eq_m256d(r, e);
3804    }
3805
3806    #[simd_test(enable = "avx")]
3807    unsafe fn test_mm_permutevar_pd() {
3808        let a = _mm_setr_pd(4., 3.);
3809        let b = _mm_setr_epi64x(3, 0);
3810        let r = _mm_permutevar_pd(a, b);
3811        let e = _mm_setr_pd(3., 4.);
3812        assert_eq_m128d(r, e);
3813    }
3814
3815    #[simd_test(enable = "avx")]
3816    unsafe fn test_mm256_permute_pd() {
3817        let a = _mm256_setr_pd(4., 3., 2., 5.);
3818        let r = _mm256_permute_pd::<5>(a);
3819        let e = _mm256_setr_pd(3., 4., 5., 2.);
3820        assert_eq_m256d(r, e);
3821    }
3822
3823    #[simd_test(enable = "avx")]
3824    unsafe fn test_mm_permute_pd() {
3825        let a = _mm_setr_pd(4., 3.);
3826        let r = _mm_permute_pd::<1>(a);
3827        let e = _mm_setr_pd(3., 4.);
3828        assert_eq_m128d(r, e);
3829    }
3830
3831    #[simd_test(enable = "avx")]
3832    unsafe fn test_mm256_permute2f128_ps() {
3833        let a = _mm256_setr_ps(1., 2., 3., 4., 1., 2., 3., 4.);
3834        let b = _mm256_setr_ps(5., 6., 7., 8., 5., 6., 7., 8.);
3835        let r = _mm256_permute2f128_ps::<0x13>(a, b);
3836        let e = _mm256_setr_ps(5., 6., 7., 8., 1., 2., 3., 4.);
3837        assert_eq_m256(r, e);
3838    }
3839
3840    #[simd_test(enable = "avx")]
3841    unsafe fn test_mm256_permute2f128_pd() {
3842        let a = _mm256_setr_pd(1., 2., 3., 4.);
3843        let b = _mm256_setr_pd(5., 6., 7., 8.);
3844        let r = _mm256_permute2f128_pd::<0x31>(a, b);
3845        let e = _mm256_setr_pd(3., 4., 7., 8.);
3846        assert_eq_m256d(r, e);
3847    }
3848
3849    #[simd_test(enable = "avx")]
3850    unsafe fn test_mm256_permute2f128_si256() {
3851        let a = _mm256_setr_epi32(1, 2, 3, 4, 1, 2, 3, 4);
3852        let b = _mm256_setr_epi32(5, 6, 7, 8, 5, 6, 7, 8);
3853        let r = _mm256_permute2f128_si256::<0x20>(a, b);
3854        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3855        assert_eq_m256i(r, e);
3856    }
3857
3858    #[simd_test(enable = "avx")]
3859    unsafe fn test_mm256_broadcast_ss() {
3860        let r = _mm256_broadcast_ss(&3.);
3861        let e = _mm256_set1_ps(3.);
3862        assert_eq_m256(r, e);
3863    }
3864
3865    #[simd_test(enable = "avx")]
3866    unsafe fn test_mm_broadcast_ss() {
3867        let r = _mm_broadcast_ss(&3.);
3868        let e = _mm_set1_ps(3.);
3869        assert_eq_m128(r, e);
3870    }
3871
3872    #[simd_test(enable = "avx")]
3873    unsafe fn test_mm256_broadcast_sd() {
3874        let r = _mm256_broadcast_sd(&3.);
3875        let e = _mm256_set1_pd(3.);
3876        assert_eq_m256d(r, e);
3877    }
3878
3879    #[simd_test(enable = "avx")]
3880    unsafe fn test_mm256_broadcast_ps() {
3881        let a = _mm_setr_ps(4., 3., 2., 5.);
3882        let r = _mm256_broadcast_ps(&a);
3883        let e = _mm256_setr_ps(4., 3., 2., 5., 4., 3., 2., 5.);
3884        assert_eq_m256(r, e);
3885    }
3886
3887    #[simd_test(enable = "avx")]
3888    unsafe fn test_mm256_broadcast_pd() {
3889        let a = _mm_setr_pd(4., 3.);
3890        let r = _mm256_broadcast_pd(&a);
3891        let e = _mm256_setr_pd(4., 3., 4., 3.);
3892        assert_eq_m256d(r, e);
3893    }
3894
3895    #[simd_test(enable = "avx")]
3896    unsafe fn test_mm256_insertf128_ps() {
3897        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3898        let b = _mm_setr_ps(4., 9., 16., 25.);
3899        let r = _mm256_insertf128_ps::<0>(a, b);
3900        let e = _mm256_setr_ps(4., 9., 16., 25., 8., 9., 64., 50.);
3901        assert_eq_m256(r, e);
3902    }
3903
3904    #[simd_test(enable = "avx")]
3905    unsafe fn test_mm256_insertf128_pd() {
3906        let a = _mm256_setr_pd(1., 2., 3., 4.);
3907        let b = _mm_setr_pd(5., 6.);
3908        let r = _mm256_insertf128_pd::<0>(a, b);
3909        let e = _mm256_setr_pd(5., 6., 3., 4.);
3910        assert_eq_m256d(r, e);
3911    }
3912
3913    #[simd_test(enable = "avx")]
3914    unsafe fn test_mm256_insertf128_si256() {
3915        let a = _mm256_setr_epi64x(1, 2, 3, 4);
3916        let b = _mm_setr_epi64x(5, 6);
3917        let r = _mm256_insertf128_si256::<0>(a, b);
3918        let e = _mm256_setr_epi64x(5, 6, 3, 4);
3919        assert_eq_m256i(r, e);
3920    }
3921
3922    #[simd_test(enable = "avx")]
3923    unsafe fn test_mm256_insert_epi8() {
3924        #[rustfmt::skip]
3925        let a = _mm256_setr_epi8(
3926            1, 2, 3, 4, 5, 6, 7, 8,
3927            9, 10, 11, 12, 13, 14, 15, 16,
3928            17, 18, 19, 20, 21, 22, 23, 24,
3929            25, 26, 27, 28, 29, 30, 31, 32,
3930        );
3931        let r = _mm256_insert_epi8::<31>(a, 0);
3932        #[rustfmt::skip]
3933        let e = _mm256_setr_epi8(
3934            1, 2, 3, 4, 5, 6, 7, 8,
3935            9, 10, 11, 12, 13, 14, 15, 16,
3936            17, 18, 19, 20, 21, 22, 23, 24,
3937            25, 26, 27, 28, 29, 30, 31, 0,
3938        );
3939        assert_eq_m256i(r, e);
3940    }
3941
3942    #[simd_test(enable = "avx")]
3943    unsafe fn test_mm256_insert_epi16() {
3944        #[rustfmt::skip]
3945        let a = _mm256_setr_epi16(
3946            0, 1, 2, 3, 4, 5, 6, 7,
3947            8, 9, 10, 11, 12, 13, 14, 15,
3948        );
3949        let r = _mm256_insert_epi16::<15>(a, 0);
3950        #[rustfmt::skip]
3951        let e = _mm256_setr_epi16(
3952            0, 1, 2, 3, 4, 5, 6, 7,
3953            8, 9, 10, 11, 12, 13, 14, 0,
3954        );
3955        assert_eq_m256i(r, e);
3956    }
3957
3958    #[simd_test(enable = "avx")]
3959    unsafe fn test_mm256_insert_epi32() {
3960        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3961        let r = _mm256_insert_epi32::<7>(a, 0);
3962        let e = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
3963        assert_eq_m256i(r, e);
3964    }
3965
3966    #[simd_test(enable = "avx")]
3967    unsafe fn test_mm256_load_pd() {
3968        let a = _mm256_setr_pd(1., 2., 3., 4.);
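        // `_mm256_load_pd` requires a 32-byte-aligned pointer; `__m256d` itself has 32-byte
        // alignment, so taking the address of `a` satisfies that requirement.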
3969        let p = ptr::addr_of!(a) as *const f64;
3970        let r = _mm256_load_pd(p);
3971        let e = _mm256_setr_pd(1., 2., 3., 4.);
3972        assert_eq_m256d(r, e);
3973    }
3974
3975    #[simd_test(enable = "avx")]
3976    unsafe fn test_mm256_store_pd() {
3977        let a = _mm256_setr_pd(1., 2., 3., 4.);
3978        let mut r = _mm256_undefined_pd();
3979        _mm256_store_pd(ptr::addr_of_mut!(r) as *mut f64, a);
3980        assert_eq_m256d(r, a);
3981    }
3982
3983    #[simd_test(enable = "avx")]
3984    unsafe fn test_mm256_load_ps() {
3985        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3986        let p = ptr::addr_of!(a) as *const f32;
3987        let r = _mm256_load_ps(p);
3988        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3989        assert_eq_m256(r, e);
3990    }
3991
3992    #[simd_test(enable = "avx")]
3993    unsafe fn test_mm256_store_ps() {
3994        let a = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
3995        let mut r = _mm256_undefined_ps();
3996        _mm256_store_ps(ptr::addr_of_mut!(r) as *mut f32, a);
3997        assert_eq_m256(r, a);
3998    }
3999
4000    #[simd_test(enable = "avx")]
4001    unsafe fn test_mm256_loadu_pd() {
4002        let a = &[1.0f64, 2., 3., 4.];
4003        let p = a.as_ptr();
4004        let r = _mm256_loadu_pd(black_box(p));
4005        let e = _mm256_setr_pd(1., 2., 3., 4.);
4006        assert_eq_m256d(r, e);
4007    }
4008
4009    #[simd_test(enable = "avx")]
4010    unsafe fn test_mm256_storeu_pd() {
4011        let a = _mm256_set1_pd(9.);
4012        let mut r = _mm256_undefined_pd();
4013        _mm256_storeu_pd(ptr::addr_of_mut!(r) as *mut f64, a);
4014        assert_eq_m256d(r, a);
4015    }
4016
4017    #[simd_test(enable = "avx")]
4018    unsafe fn test_mm256_loadu_ps() {
4019        let a = &[4.0f32, 3., 2., 5., 8., 9., 64., 50.];
4020        let p = a.as_ptr();
4021        let r = _mm256_loadu_ps(black_box(p));
4022        let e = _mm256_setr_ps(4., 3., 2., 5., 8., 9., 64., 50.);
4023        assert_eq_m256(r, e);
4024    }
4025
4026    #[simd_test(enable = "avx")]
4027    unsafe fn test_mm256_storeu_ps() {
4028        let a = _mm256_set1_ps(9.);
4029        let mut r = _mm256_undefined_ps();
4030        _mm256_storeu_ps(ptr::addr_of_mut!(r) as *mut f32, a);
4031        assert_eq_m256(r, a);
4032    }
4033
4034    #[simd_test(enable = "avx")]
4035    unsafe fn test_mm256_load_si256() {
4036        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4037        let p = ptr::addr_of!(a);
4038        let r = _mm256_load_si256(p);
4039        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4040        assert_eq_m256i(r, e);
4041    }
4042
4043    #[simd_test(enable = "avx")]
4044    unsafe fn test_mm256_store_si256() {
4045        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4046        let mut r = _mm256_undefined_si256();
4047        _mm256_store_si256(ptr::addr_of_mut!(r), a);
4048        assert_eq_m256i(r, a);
4049    }
4050
4051    #[simd_test(enable = "avx")]
4052    unsafe fn test_mm256_loadu_si256() {
4053        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4054        let p = ptr::addr_of!(a);
4055        let r = _mm256_loadu_si256(black_box(p));
4056        let e = _mm256_setr_epi64x(1, 2, 3, 4);
4057        assert_eq_m256i(r, e);
4058    }
4059
4060    #[simd_test(enable = "avx")]
4061    unsafe fn test_mm256_storeu_si256() {
4062        let a = _mm256_set1_epi8(9);
4063        let mut r = _mm256_undefined_si256();
4064        _mm256_storeu_si256(ptr::addr_of_mut!(r), a);
4065        assert_eq_m256i(r, a);
4066    }
4067
4068    #[simd_test(enable = "avx")]
4069    unsafe fn test_mm256_maskload_pd() {
4070        let a = &[1.0f64, 2., 3., 4.];
4071        let p = a.as_ptr();
4072        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
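        // Each element is loaded only if the most significant bit of the corresponding mask
        // element is set; masked-out elements are zeroed.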
4073        let r = _mm256_maskload_pd(black_box(p), mask);
4074        let e = _mm256_setr_pd(0., 2., 0., 4.);
4075        assert_eq_m256d(r, e);
4076    }
4077
4078    #[simd_test(enable = "avx")]
4079    unsafe fn test_mm256_maskstore_pd() {
4080        let mut r = _mm256_set1_pd(0.);
4081        let mask = _mm256_setr_epi64x(0, !0, 0, !0);
4082        let a = _mm256_setr_pd(1., 2., 3., 4.);
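        // Each element is written only if the corresponding mask element has its most
        // significant bit set; the rest of the destination is left untouched (0 here).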
4083        _mm256_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4084        let e = _mm256_setr_pd(0., 2., 0., 4.);
4085        assert_eq_m256d(r, e);
4086    }
4087
4088    #[simd_test(enable = "avx")]
4089    unsafe fn test_mm_maskload_pd() {
4090        let a = &[1.0f64, 2.];
4091        let p = a.as_ptr();
4092        let mask = _mm_setr_epi64x(0, !0);
4093        let r = _mm_maskload_pd(black_box(p), mask);
4094        let e = _mm_setr_pd(0., 2.);
4095        assert_eq_m128d(r, e);
4096    }
4097
4098    #[simd_test(enable = "avx")]
4099    unsafe fn test_mm_maskstore_pd() {
4100        let mut r = _mm_set1_pd(0.);
4101        let mask = _mm_setr_epi64x(0, !0);
4102        let a = _mm_setr_pd(1., 2.);
4103        _mm_maskstore_pd(ptr::addr_of_mut!(r) as *mut f64, mask, a);
4104        let e = _mm_setr_pd(0., 2.);
4105        assert_eq_m128d(r, e);
4106    }
4107
4108    #[simd_test(enable = "avx")]
4109    unsafe fn test_mm256_maskload_ps() {
4110        let a = &[1.0f32, 2., 3., 4., 5., 6., 7., 8.];
4111        let p = a.as_ptr();
4112        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4113        let r = _mm256_maskload_ps(black_box(p), mask);
4114        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4115        assert_eq_m256(r, e);
4116    }
4117
4118    #[simd_test(enable = "avx")]
4119    unsafe fn test_mm256_maskstore_ps() {
4120        let mut r = _mm256_set1_ps(0.);
4121        let mask = _mm256_setr_epi32(0, !0, 0, !0, 0, !0, 0, !0);
4122        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4123        _mm256_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4124        let e = _mm256_setr_ps(0., 2., 0., 4., 0., 6., 0., 8.);
4125        assert_eq_m256(r, e);
4126    }
4127
4128    #[simd_test(enable = "avx")]
4129    unsafe fn test_mm_maskload_ps() {
4130        let a = &[1.0f32, 2., 3., 4.];
4131        let p = a.as_ptr();
4132        let mask = _mm_setr_epi32(0, !0, 0, !0);
4133        let r = _mm_maskload_ps(black_box(p), mask);
4134        let e = _mm_setr_ps(0., 2., 0., 4.);
4135        assert_eq_m128(r, e);
4136    }
4137
4138    #[simd_test(enable = "avx")]
4139    unsafe fn test_mm_maskstore_ps() {
4140        let mut r = _mm_set1_ps(0.);
4141        let mask = _mm_setr_epi32(0, !0, 0, !0);
4142        let a = _mm_setr_ps(1., 2., 3., 4.);
4143        _mm_maskstore_ps(ptr::addr_of_mut!(r) as *mut f32, mask, a);
4144        let e = _mm_setr_ps(0., 2., 0., 4.);
4145        assert_eq_m128(r, e);
4146    }
4147
4148    #[simd_test(enable = "avx")]
4149    unsafe fn test_mm256_movehdup_ps() {
4150        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
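        // vmovshdup copies each odd-indexed element into both positions of its pair,
        // e.g. element 1 into slots 0 and 1.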
4151        let r = _mm256_movehdup_ps(a);
4152        let e = _mm256_setr_ps(2., 2., 4., 4., 6., 6., 8., 8.);
4153        assert_eq_m256(r, e);
4154    }
4155
4156    #[simd_test(enable = "avx")]
4157    unsafe fn test_mm256_moveldup_ps() {
4158        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4159        let r = _mm256_moveldup_ps(a);
4160        let e = _mm256_setr_ps(1., 1., 3., 3., 5., 5., 7., 7.);
4161        assert_eq_m256(r, e);
4162    }
4163
4164    #[simd_test(enable = "avx")]
4165    unsafe fn test_mm256_movedup_pd() {
4166        let a = _mm256_setr_pd(1., 2., 3., 4.);
4167        let r = _mm256_movedup_pd(a);
4168        let e = _mm256_setr_pd(1., 1., 3., 3.);
4169        assert_eq_m256d(r, e);
4170    }
4171
4172    #[simd_test(enable = "avx")]
4173    unsafe fn test_mm256_lddqu_si256() {
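        // vlddqu is an unaligned integer load that was historically meant to be faster when
        // the data straddles a cache line; its result is identical to an ordinary unaligned load.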
4174        #[rustfmt::skip]
4175        let a = _mm256_setr_epi8(
4176            1, 2, 3, 4, 5, 6, 7, 8,
4177            9, 10, 11, 12, 13, 14, 15, 16,
4178            17, 18, 19, 20, 21, 22, 23, 24,
4179            25, 26, 27, 28, 29, 30, 31, 32,
4180        );
4181        let p = ptr::addr_of!(a);
4182        let r = _mm256_lddqu_si256(black_box(p));
4183        #[rustfmt::skip]
4184        let e = _mm256_setr_epi8(
4185            1, 2, 3, 4, 5, 6, 7, 8,
4186            9, 10, 11, 12, 13, 14, 15, 16,
4187            17, 18, 19, 20, 21, 22, 23, 24,
4188            25, 26, 27, 28, 29, 30, 31, 32,
4189        );
4190        assert_eq_m256i(r, e);
4191    }
4192
4193    #[simd_test(enable = "avx")]
4194    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4195    unsafe fn test_mm256_stream_si256() {
4196        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4197        let mut r = _mm256_undefined_si256();
4198        _mm256_stream_si256(ptr::addr_of_mut!(r), a);
4199        assert_eq_m256i(r, a);
4200    }
4201
4202    #[simd_test(enable = "avx")]
4203    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4204    unsafe fn test_mm256_stream_pd() {
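        // Non-temporal stores (vmovntpd) need a 32-byte-aligned destination, which the
        // #[repr(align(32))] wrapper below provides.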
4205        #[repr(align(32))]
4206        struct Memory {
4207            pub data: [f64; 4],
4208        }
4209        let a = _mm256_set1_pd(7.0);
4210        let mut mem = Memory { data: [-1.0; 4] };
4211
4212        _mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4213        for i in 0..4 {
4214            assert_eq!(mem.data[i], get_m256d(a, i));
4215        }
4216    }
4217
4218    #[simd_test(enable = "avx")]
4219    #[cfg_attr(miri, ignore)] // Non-temporal store, which is not supported by Miri
4220    unsafe fn test_mm256_stream_ps() {
4221        #[repr(align(32))]
4222        struct Memory {
4223            pub data: [f32; 8],
4224        }
4225        let a = _mm256_set1_ps(7.0);
4226        let mut mem = Memory { data: [-1.0; 8] };
4227
4228        _mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4229        for i in 0..8 {
4230            assert_eq!(mem.data[i], get_m256(a, i));
4231        }
4232    }
4233
4234    #[simd_test(enable = "avx")]
4235    unsafe fn test_mm256_rcp_ps() {
4236        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4237        let r = _mm256_rcp_ps(a);
4238        #[rustfmt::skip]
4239        let e = _mm256_setr_ps(
4240            0.99975586, 0.49987793, 0.33325195, 0.24993896,
4241            0.19995117, 0.16662598, 0.14282227, 0.12496948,
4242        );
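        // vrcpps only computes an approximate reciprocal (roughly 12 bits of relative
        // accuracy), so the comparison uses a loose tolerance.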
4243        let rel_err = 0.00048828125;
4244        for i in 0..8 {
4245            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4246        }
4247    }
4248
4249    #[simd_test(enable = "avx")]
4250    unsafe fn test_mm256_rsqrt_ps() {
4251        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4252        let r = _mm256_rsqrt_ps(a);
4253        #[rustfmt::skip]
4254        let e = _mm256_setr_ps(
4255            0.99975586, 0.7069092, 0.5772705, 0.49987793,
4256            0.44714355, 0.40820313, 0.3779297, 0.3534546,
4257        );
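        // vrsqrtps is likewise an approximation, so the same loose tolerance applies.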
4258        let rel_err = 0.00048828125;
4259        for i in 0..8 {
4260            assert_approx_eq!(get_m256(r, i), get_m256(e, i), 2. * rel_err);
4261        }
4262    }
4263
4264    #[simd_test(enable = "avx")]
4265    unsafe fn test_mm256_unpackhi_pd() {
4266        let a = _mm256_setr_pd(1., 2., 3., 4.);
4267        let b = _mm256_setr_pd(5., 6., 7., 8.);
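        // AVX unpacking works per 128-bit lane: the high element of each lane of `a` is
        // interleaved with the high element of the same lane of `b`.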
4268        let r = _mm256_unpackhi_pd(a, b);
4269        let e = _mm256_setr_pd(2., 6., 4., 8.);
4270        assert_eq_m256d(r, e);
4271    }
4272
4273    #[simd_test(enable = "avx")]
4274    unsafe fn test_mm256_unpackhi_ps() {
4275        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4276        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4277        let r = _mm256_unpackhi_ps(a, b);
4278        let e = _mm256_setr_ps(3., 11., 4., 12., 7., 15., 8., 16.);
4279        assert_eq_m256(r, e);
4280    }
4281
4282    #[simd_test(enable = "avx")]
4283    unsafe fn test_mm256_unpacklo_pd() {
4284        let a = _mm256_setr_pd(1., 2., 3., 4.);
4285        let b = _mm256_setr_pd(5., 6., 7., 8.);
4286        let r = _mm256_unpacklo_pd(a, b);
4287        let e = _mm256_setr_pd(1., 5., 3., 7.);
4288        assert_eq_m256d(r, e);
4289    }
4290
4291    #[simd_test(enable = "avx")]
4292    unsafe fn test_mm256_unpacklo_ps() {
4293        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4294        let b = _mm256_setr_ps(9., 10., 11., 12., 13., 14., 15., 16.);
4295        let r = _mm256_unpacklo_ps(a, b);
4296        let e = _mm256_setr_ps(1., 9., 2., 10., 5., 13., 6., 14.);
4297        assert_eq_m256(r, e);
4298    }
4299
4300    #[simd_test(enable = "avx")]
4301    unsafe fn test_mm256_testz_si256() {
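        // testz returns the ZF flag computed by vptest: 1 exactly when (a & b) has no set bits.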
4302        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4303        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4304        let r = _mm256_testz_si256(a, b);
4305        assert_eq!(r, 0);
4306        let b = _mm256_set1_epi64x(0);
4307        let r = _mm256_testz_si256(a, b);
4308        assert_eq!(r, 1);
4309    }
4310
4311    #[simd_test(enable = "avx")]
4312    unsafe fn test_mm256_testc_si256() {
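        // testc returns the CF flag of vptest: 1 exactly when (!a & b) has no set bits.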
4313        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4314        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4315        let r = _mm256_testc_si256(a, b);
4316        assert_eq!(r, 0);
4317        let b = _mm256_set1_epi64x(0);
4318        let r = _mm256_testc_si256(a, b);
4319        assert_eq!(r, 1);
4320    }
4321
4322    #[simd_test(enable = "avx")]
4323    unsafe fn test_mm256_testnzc_si256() {
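        // testnzc is 1 only when both (a & b) and (!a & b) contain at least one set bit,
        // i.e. ZF == 0 and CF == 0.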
4324        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4325        let b = _mm256_setr_epi64x(5, 6, 7, 8);
4326        let r = _mm256_testnzc_si256(a, b);
4327        assert_eq!(r, 1);
4328        let a = _mm256_setr_epi64x(0, 0, 0, 0);
4329        let b = _mm256_setr_epi64x(0, 0, 0, 0);
4330        let r = _mm256_testnzc_si256(a, b);
4331        assert_eq!(r, 0);
4332    }
4333
4334    #[simd_test(enable = "avx")]
4335    unsafe fn test_mm256_testz_pd() {
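        // The floating-point test intrinsics (vtestpd/vtestps) consider only the sign bit
        // of each element.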
4336        let a = _mm256_setr_pd(1., 2., 3., 4.);
4337        let b = _mm256_setr_pd(5., 6., 7., 8.);
4338        let r = _mm256_testz_pd(a, b);
4339        assert_eq!(r, 1);
4340        let a = _mm256_set1_pd(-1.);
4341        let r = _mm256_testz_pd(a, a);
4342        assert_eq!(r, 0);
4343    }
4344
4345    #[simd_test(enable = "avx")]
4346    unsafe fn test_mm256_testc_pd() {
4347        let a = _mm256_setr_pd(1., 2., 3., 4.);
4348        let b = _mm256_setr_pd(5., 6., 7., 8.);
4349        let r = _mm256_testc_pd(a, b);
4350        assert_eq!(r, 1);
4351        let a = _mm256_set1_pd(1.);
4352        let b = _mm256_set1_pd(-1.);
4353        let r = _mm256_testc_pd(a, b);
4354        assert_eq!(r, 0);
4355    }
4356
4357    #[simd_test(enable = "avx")]
4358    unsafe fn test_mm256_testnzc_pd() {
4359        let a = _mm256_setr_pd(1., 2., 3., 4.);
4360        let b = _mm256_setr_pd(5., 6., 7., 8.);
4361        let r = _mm256_testnzc_pd(a, b);
4362        assert_eq!(r, 0);
4363        let a = _mm256_setr_pd(1., -1., -1., -1.);
4364        let b = _mm256_setr_pd(-1., -1., 1., 1.);
4365        let r = _mm256_testnzc_pd(a, b);
4366        assert_eq!(r, 1);
4367    }
4368
4369    #[simd_test(enable = "avx")]
4370    unsafe fn test_mm_testz_pd() {
4371        let a = _mm_setr_pd(1., 2.);
4372        let b = _mm_setr_pd(5., 6.);
4373        let r = _mm_testz_pd(a, b);
4374        assert_eq!(r, 1);
4375        let a = _mm_set1_pd(-1.);
4376        let r = _mm_testz_pd(a, a);
4377        assert_eq!(r, 0);
4378    }
4379
4380    #[simd_test(enable = "avx")]
4381    unsafe fn test_mm_testc_pd() {
4382        let a = _mm_setr_pd(1., 2.);
4383        let b = _mm_setr_pd(5., 6.);
4384        let r = _mm_testc_pd(a, b);
4385        assert_eq!(r, 1);
4386        let a = _mm_set1_pd(1.);
4387        let b = _mm_set1_pd(-1.);
4388        let r = _mm_testc_pd(a, b);
4389        assert_eq!(r, 0);
4390    }
4391
4392    #[simd_test(enable = "avx")]
4393    unsafe fn test_mm_testnzc_pd() {
4394        let a = _mm_setr_pd(1., 2.);
4395        let b = _mm_setr_pd(5., 6.);
4396        let r = _mm_testnzc_pd(a, b);
4397        assert_eq!(r, 0);
4398        let a = _mm_setr_pd(1., -1.);
4399        let b = _mm_setr_pd(-1., -1.);
4400        let r = _mm_testnzc_pd(a, b);
4401        assert_eq!(r, 1);
4402    }
4403
4404    #[simd_test(enable = "avx")]
4405    unsafe fn test_mm256_testz_ps() {
4406        let a = _mm256_set1_ps(1.);
4407        let r = _mm256_testz_ps(a, a);
4408        assert_eq!(r, 1);
4409        let a = _mm256_set1_ps(-1.);
4410        let r = _mm256_testz_ps(a, a);
4411        assert_eq!(r, 0);
4412    }
4413
4414    #[simd_test(enable = "avx")]
4415    unsafe fn test_mm256_testc_ps() {
4416        let a = _mm256_set1_ps(1.);
4417        let r = _mm256_testc_ps(a, a);
4418        assert_eq!(r, 1);
4419        let b = _mm256_set1_ps(-1.);
4420        let r = _mm256_testc_ps(a, b);
4421        assert_eq!(r, 0);
4422    }
4423
4424    #[simd_test(enable = "avx")]
4425    unsafe fn test_mm256_testnzc_ps() {
4426        let a = _mm256_set1_ps(1.);
4427        let r = _mm256_testnzc_ps(a, a);
4428        assert_eq!(r, 0);
4429        let a = _mm256_setr_ps(1., -1., -1., -1., -1., -1., -1., -1.);
4430        let b = _mm256_setr_ps(-1., -1., 1., 1., 1., 1., 1., 1.);
4431        let r = _mm256_testnzc_ps(a, b);
4432        assert_eq!(r, 1);
4433    }
4434
4435    #[simd_test(enable = "avx")]
4436    unsafe fn test_mm_testz_ps() {
4437        let a = _mm_set1_ps(1.);
4438        let r = _mm_testz_ps(a, a);
4439        assert_eq!(r, 1);
4440        let a = _mm_set1_ps(-1.);
4441        let r = _mm_testz_ps(a, a);
4442        assert_eq!(r, 0);
4443    }
4444
4445    #[simd_test(enable = "avx")]
4446    unsafe fn test_mm_testc_ps() {
4447        let a = _mm_set1_ps(1.);
4448        let r = _mm_testc_ps(a, a);
4449        assert_eq!(r, 1);
4450        let b = _mm_set1_ps(-1.);
4451        let r = _mm_testc_ps(a, b);
4452        assert_eq!(r, 0);
4453    }
4454
4455    #[simd_test(enable = "avx")]
4456    unsafe fn test_mm_testnzc_ps() {
4457        let a = _mm_set1_ps(1.);
4458        let r = _mm_testnzc_ps(a, a);
4459        assert_eq!(r, 0);
4460        let a = _mm_setr_ps(1., -1., -1., -1.);
4461        let b = _mm_setr_ps(-1., -1., 1., 1.);
4462        let r = _mm_testnzc_ps(a, b);
4463        assert_eq!(r, 1);
4464    }
4465
4466    #[simd_test(enable = "avx")]
4467    unsafe fn test_mm256_movemask_pd() {
4468        let a = _mm256_setr_pd(1., -2., 3., -4.);
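        // -2. and -4. set the sign bits of lanes 1 and 3, so the mask is 0b1010 (0xA).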
4469        let r = _mm256_movemask_pd(a);
4470        assert_eq!(r, 0xA);
4471    }
4472
4473    #[simd_test(enable = "avx")]
4474    unsafe fn test_mm256_movemask_ps() {
4475        let a = _mm256_setr_ps(1., -2., 3., -4., 1., -2., 3., -4.);
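        // Lanes 1, 3, 5 and 7 are negative, so the mask is 0b1010_1010 (0xAA).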
4476        let r = _mm256_movemask_ps(a);
4477        assert_eq!(r, 0xAA);
4478    }
4479
4480    #[simd_test(enable = "avx")]
4481    unsafe fn test_mm256_setzero_pd() {
4482        let r = _mm256_setzero_pd();
4483        assert_eq_m256d(r, _mm256_set1_pd(0.));
4484    }
4485
4486    #[simd_test(enable = "avx")]
4487    unsafe fn test_mm256_setzero_ps() {
4488        let r = _mm256_setzero_ps();
4489        assert_eq_m256(r, _mm256_set1_ps(0.));
4490    }
4491
4492    #[simd_test(enable = "avx")]
4493    unsafe fn test_mm256_setzero_si256() {
4494        let r = _mm256_setzero_si256();
4495        assert_eq_m256i(r, _mm256_set1_epi8(0));
4496    }
4497
4498    #[simd_test(enable = "avx")]
4499    unsafe fn test_mm256_set_pd() {
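        // _mm256_set_* lists elements from the highest lane down, so the result equals the
        // corresponding _mm256_setr_* call with the arguments reversed.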
4500        let r = _mm256_set_pd(1., 2., 3., 4.);
4501        assert_eq_m256d(r, _mm256_setr_pd(4., 3., 2., 1.));
4502    }
4503
4504    #[simd_test(enable = "avx")]
4505    unsafe fn test_mm256_set_ps() {
4506        let r = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4507        assert_eq_m256(r, _mm256_setr_ps(8., 7., 6., 5., 4., 3., 2., 1.));
4508    }
4509
4510    #[simd_test(enable = "avx")]
4511    unsafe fn test_mm256_set_epi8() {
4512        #[rustfmt::skip]
4513        let r = _mm256_set_epi8(
4514            1, 2, 3, 4, 5, 6, 7, 8,
4515            9, 10, 11, 12, 13, 14, 15, 16,
4516            17, 18, 19, 20, 21, 22, 23, 24,
4517            25, 26, 27, 28, 29, 30, 31, 32,
4518        );
4519        #[rustfmt::skip]
4520        let e = _mm256_setr_epi8(
4521            32, 31, 30, 29, 28, 27, 26, 25,
4522            24, 23, 22, 21, 20, 19, 18, 17,
4523            16, 15, 14, 13, 12, 11, 10, 9,
4524            8, 7, 6, 5, 4, 3, 2, 1
4525        );
4526        assert_eq_m256i(r, e);
4527    }
4528
4529    #[simd_test(enable = "avx")]
4530    unsafe fn test_mm256_set_epi16() {
4531        #[rustfmt::skip]
4532        let r = _mm256_set_epi16(
4533            1, 2, 3, 4, 5, 6, 7, 8,
4534            9, 10, 11, 12, 13, 14, 15, 16,
4535        );
4536        #[rustfmt::skip]
4537        let e = _mm256_setr_epi16(
4538            16, 15, 14, 13, 12, 11, 10, 9,
4539            8, 7, 6, 5, 4, 3, 2, 1,
4540        );
4541        assert_eq_m256i(r, e);
4542    }
4543
4544    #[simd_test(enable = "avx")]
4545    unsafe fn test_mm256_set_epi32() {
4546        let r = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4547        assert_eq_m256i(r, _mm256_setr_epi32(8, 7, 6, 5, 4, 3, 2, 1));
4548    }
4549
4550    #[simd_test(enable = "avx")]
4551    unsafe fn test_mm256_set_epi64x() {
4552        let r = _mm256_set_epi64x(1, 2, 3, 4);
4553        assert_eq_m256i(r, _mm256_setr_epi64x(4, 3, 2, 1));
4554    }
4555
4556    #[simd_test(enable = "avx")]
4557    unsafe fn test_mm256_setr_pd() {
4558        let r = _mm256_setr_pd(1., 2., 3., 4.);
4559        assert_eq_m256d(r, _mm256_setr_pd(1., 2., 3., 4.));
4560    }
4561
4562    #[simd_test(enable = "avx")]
4563    unsafe fn test_mm256_setr_ps() {
4564        let r = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4565        assert_eq_m256(r, _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.));
4566    }
4567
4568    #[simd_test(enable = "avx")]
4569    unsafe fn test_mm256_setr_epi8() {
4570        #[rustfmt::skip]
4571        let r = _mm256_setr_epi8(
4572            1, 2, 3, 4, 5, 6, 7, 8,
4573            9, 10, 11, 12, 13, 14, 15, 16,
4574            17, 18, 19, 20, 21, 22, 23, 24,
4575            25, 26, 27, 28, 29, 30, 31, 32,
4576        );
4577        #[rustfmt::skip]
4578        let e = _mm256_setr_epi8(
4579            1, 2, 3, 4, 5, 6, 7, 8,
4580            9, 10, 11, 12, 13, 14, 15, 16,
4581            17, 18, 19, 20, 21, 22, 23, 24,
4582            25, 26, 27, 28, 29, 30, 31, 32
4583        );
4584
4585        assert_eq_m256i(r, e);
4586    }
4587
4588    #[simd_test(enable = "avx")]
4589    unsafe fn test_mm256_setr_epi16() {
4590        #[rustfmt::skip]
4591        let r = _mm256_setr_epi16(
4592            1, 2, 3, 4, 5, 6, 7, 8,
4593            9, 10, 11, 12, 13, 14, 15, 16,
4594        );
4595        #[rustfmt::skip]
4596        let e = _mm256_setr_epi16(
4597            1, 2, 3, 4, 5, 6, 7, 8,
4598            9, 10, 11, 12, 13, 14, 15, 16,
4599        );
4600        assert_eq_m256i(r, e);
4601    }
4602
4603    #[simd_test(enable = "avx")]
4604    unsafe fn test_mm256_setr_epi32() {
4605        let r = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4606        assert_eq_m256i(r, _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8));
4607    }
4608
4609    #[simd_test(enable = "avx")]
4610    unsafe fn test_mm256_setr_epi64x() {
4611        let r = _mm256_setr_epi64x(1, 2, 3, 4);
4612        assert_eq_m256i(r, _mm256_setr_epi64x(1, 2, 3, 4));
4613    }
4614
4615    #[simd_test(enable = "avx")]
4616    unsafe fn test_mm256_set1_pd() {
4617        let r = _mm256_set1_pd(1.);
4618        assert_eq_m256d(r, _mm256_set1_pd(1.));
4619    }
4620
4621    #[simd_test(enable = "avx")]
4622    unsafe fn test_mm256_set1_ps() {
4623        let r = _mm256_set1_ps(1.);
4624        assert_eq_m256(r, _mm256_set1_ps(1.));
4625    }
4626
4627    #[simd_test(enable = "avx")]
4628    unsafe fn test_mm256_set1_epi8() {
4629        let r = _mm256_set1_epi8(1);
4630        assert_eq_m256i(r, _mm256_set1_epi8(1));
4631    }
4632
4633    #[simd_test(enable = "avx")]
4634    unsafe fn test_mm256_set1_epi16() {
4635        let r = _mm256_set1_epi16(1);
4636        assert_eq_m256i(r, _mm256_set1_epi16(1));
4637    }
4638
4639    #[simd_test(enable = "avx")]
4640    unsafe fn test_mm256_set1_epi32() {
4641        let r = _mm256_set1_epi32(1);
4642        assert_eq_m256i(r, _mm256_set1_epi32(1));
4643    }
4644
4645    #[simd_test(enable = "avx")]
4646    unsafe fn test_mm256_set1_epi64x() {
4647        let r = _mm256_set1_epi64x(1);
4648        assert_eq_m256i(r, _mm256_set1_epi64x(1));
4649    }
4650
4651    #[simd_test(enable = "avx")]
4652    unsafe fn test_mm256_castpd_ps() {
4653        let a = _mm256_setr_pd(1., 2., 3., 4.);
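        // The cast only reinterprets bits: 1.0f64 is 0x3FF0_0000_0000_0000, whose low 32 bits
        // read as 0.0f32 and whose high 32 bits (0x3FF00000) read as 1.875f32, which is why
        // the expected vector below looks scrambled.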
4654        let r = _mm256_castpd_ps(a);
4655        let e = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4656        assert_eq_m256(r, e);
4657    }
4658
4659    #[simd_test(enable = "avx")]
4660    unsafe fn test_mm256_castps_pd() {
4661        let a = _mm256_setr_ps(0., 1.875, 0., 2., 0., 2.125, 0., 2.25);
4662        let r = _mm256_castps_pd(a);
4663        let e = _mm256_setr_pd(1., 2., 3., 4.);
4664        assert_eq_m256d(r, e);
4665    }
4666
4667    #[simd_test(enable = "avx")]
4668    unsafe fn test_mm256_castps_si256() {
4669        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4670        let r = _mm256_castps_si256(a);
4671        #[rustfmt::skip]
4672        let e = _mm256_setr_epi8(
4673            0, 0, -128, 63, 0, 0, 0, 64,
4674            0, 0, 64, 64, 0, 0, -128, 64,
4675            0, 0, -96, 64, 0, 0, -64, 64,
4676            0, 0, -32, 64, 0, 0, 0, 65,
4677        );
4678        assert_eq_m256i(r, e);
4679    }
4680
4681    #[simd_test(enable = "avx")]
4682    unsafe fn test_mm256_castsi256_ps() {
4683        #[rustfmt::skip]
4684        let a = _mm256_setr_epi8(
4685            0, 0, -128, 63, 0, 0, 0, 64,
4686            0, 0, 64, 64, 0, 0, -128, 64,
4687            0, 0, -96, 64, 0, 0, -64, 64,
4688            0, 0, -32, 64, 0, 0, 0, 65,
4689        );
4690        let r = _mm256_castsi256_ps(a);
4691        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4692        assert_eq_m256(r, e);
4693    }
4694
4695    #[simd_test(enable = "avx")]
4696    unsafe fn test_mm256_castpd_si256() {
4697        let a = _mm256_setr_pd(1., 2., 3., 4.);
4698        let r = _mm256_castpd_si256(a);
4699        assert_eq_m256d(transmute(r), a);
4700    }
4701
4702    #[simd_test(enable = "avx")]
4703    unsafe fn test_mm256_castsi256_pd() {
4704        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4705        let r = _mm256_castsi256_pd(a);
4706        assert_eq_m256d(r, transmute(a));
4707    }
4708
4709    #[simd_test(enable = "avx")]
4710    unsafe fn test_mm256_castps256_ps128() {
4711        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4712        let r = _mm256_castps256_ps128(a);
4713        assert_eq_m128(r, _mm_setr_ps(1., 2., 3., 4.));
4714    }
4715
4716    #[simd_test(enable = "avx")]
4717    unsafe fn test_mm256_castpd256_pd128() {
4718        let a = _mm256_setr_pd(1., 2., 3., 4.);
4719        let r = _mm256_castpd256_pd128(a);
4720        assert_eq_m128d(r, _mm_setr_pd(1., 2.));
4721    }
4722
4723    #[simd_test(enable = "avx")]
4724    unsafe fn test_mm256_castsi256_si128() {
4725        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4726        let r = _mm256_castsi256_si128(a);
4727        assert_eq_m128i(r, _mm_setr_epi64x(1, 2));
4728    }
4729
4730    #[simd_test(enable = "avx")]
4731    unsafe fn test_mm256_castps128_ps256() {
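        // The 128->256 casts leave the upper 128 bits undefined, so only the low half of the
        // result is checked.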
4732        let a = _mm_setr_ps(1., 2., 3., 4.);
4733        let r = _mm256_castps128_ps256(a);
4734        assert_eq_m128(_mm256_castps256_ps128(r), a);
4735    }
4736
4737    #[simd_test(enable = "avx")]
4738    unsafe fn test_mm256_castpd128_pd256() {
4739        let a = _mm_setr_pd(1., 2.);
4740        let r = _mm256_castpd128_pd256(a);
4741        assert_eq_m128d(_mm256_castpd256_pd128(r), a);
4742    }
4743
4744    #[simd_test(enable = "avx")]
4745    unsafe fn test_mm256_castsi128_si256() {
4746        let a = _mm_setr_epi32(1, 2, 3, 4);
4747        let r = _mm256_castsi128_si256(a);
4748        assert_eq_m128i(_mm256_castsi256_si128(r), a);
4749    }
4750
4751    #[simd_test(enable = "avx")]
4752    unsafe fn test_mm256_zextps128_ps256() {
4753        let a = _mm_setr_ps(1., 2., 3., 4.);
4754        let r = _mm256_zextps128_ps256(a);
4755        let e = _mm256_setr_ps(1., 2., 3., 4., 0., 0., 0., 0.);
4756        assert_eq_m256(r, e);
4757    }
4758
4759    #[simd_test(enable = "avx")]
4760    unsafe fn test_mm256_zextsi128_si256() {
4761        let a = _mm_setr_epi64x(1, 2);
4762        let r = _mm256_zextsi128_si256(a);
4763        let e = _mm256_setr_epi64x(1, 2, 0, 0);
4764        assert_eq_m256i(r, e);
4765    }
4766
4767    #[simd_test(enable = "avx")]
4768    unsafe fn test_mm256_zextpd128_pd256() {
4769        let a = _mm_setr_pd(1., 2.);
4770        let r = _mm256_zextpd128_pd256(a);
4771        let e = _mm256_setr_pd(1., 2., 0., 0.);
4772        assert_eq_m256d(r, e);
4773    }
4774
4775    #[simd_test(enable = "avx")]
4776    unsafe fn test_mm256_set_m128() {
4777        let hi = _mm_setr_ps(5., 6., 7., 8.);
4778        let lo = _mm_setr_ps(1., 2., 3., 4.);
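        // The first argument becomes the high 128-bit lane and the second the low lane.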
4779        let r = _mm256_set_m128(hi, lo);
4780        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4781        assert_eq_m256(r, e);
4782    }
4783
4784    #[simd_test(enable = "avx")]
4785    unsafe fn test_mm256_set_m128d() {
4786        let hi = _mm_setr_pd(3., 4.);
4787        let lo = _mm_setr_pd(1., 2.);
4788        let r = _mm256_set_m128d(hi, lo);
4789        let e = _mm256_setr_pd(1., 2., 3., 4.);
4790        assert_eq_m256d(r, e);
4791    }
4792
4793    #[simd_test(enable = "avx")]
4794    unsafe fn test_mm256_set_m128i() {
4795        #[rustfmt::skip]
4796        let hi = _mm_setr_epi8(
4797            17, 18, 19, 20,
4798            21, 22, 23, 24,
4799            25, 26, 27, 28,
4800            29, 30, 31, 32,
4801        );
4802        #[rustfmt::skip]
4803        let lo = _mm_setr_epi8(
4804            1, 2, 3, 4,
4805            5, 6, 7, 8,
4806            9, 10, 11, 12,
4807            13, 14, 15, 16,
4808        );
4809        let r = _mm256_set_m128i(hi, lo);
4810        #[rustfmt::skip]
4811        let e = _mm256_setr_epi8(
4812            1, 2, 3, 4, 5, 6, 7, 8,
4813            9, 10, 11, 12, 13, 14, 15, 16,
4814            17, 18, 19, 20, 21, 22, 23, 24,
4815            25, 26, 27, 28, 29, 30, 31, 32,
4816        );
4817        assert_eq_m256i(r, e);
4818    }
4819
4820    #[simd_test(enable = "avx")]
4821    unsafe fn test_mm256_setr_m128() {
4822        let lo = _mm_setr_ps(1., 2., 3., 4.);
4823        let hi = _mm_setr_ps(5., 6., 7., 8.);
4824        let r = _mm256_setr_m128(lo, hi);
4825        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4826        assert_eq_m256(r, e);
4827    }
4828
4829    #[simd_test(enable = "avx")]
4830    unsafe fn test_mm256_setr_m128d() {
4831        let lo = _mm_setr_pd(1., 2.);
4832        let hi = _mm_setr_pd(3., 4.);
4833        let r = _mm256_setr_m128d(lo, hi);
4834        let e = _mm256_setr_pd(1., 2., 3., 4.);
4835        assert_eq_m256d(r, e);
4836    }
4837
4838    #[simd_test(enable = "avx")]
4839    unsafe fn test_mm256_setr_m128i() {
4840        #[rustfmt::skip]
4841        let lo = _mm_setr_epi8(
4842            1, 2, 3, 4,
4843            5, 6, 7, 8,
4844            9, 10, 11, 12,
4845            13, 14, 15, 16,
4846        );
4847        #[rustfmt::skip]
4848        let hi = _mm_setr_epi8(
4849            17, 18, 19, 20, 21, 22, 23, 24,
4850            25, 26, 27, 28, 29, 30, 31, 32,
4851        );
4852        let r = _mm256_setr_m128i(lo, hi);
4853        #[rustfmt::skip]
4854        let e = _mm256_setr_epi8(
4855            1, 2, 3, 4, 5, 6, 7, 8,
4856            9, 10, 11, 12, 13, 14, 15, 16,
4857            17, 18, 19, 20, 21, 22, 23, 24,
4858            25, 26, 27, 28, 29, 30, 31, 32,
4859        );
4860        assert_eq_m256i(r, e);
4861    }
4862
4863    #[simd_test(enable = "avx")]
4864    unsafe fn test_mm256_loadu2_m128() {
4865        let hi = &[5., 6., 7., 8.];
4866        let hiaddr = hi.as_ptr();
4867        let lo = &[1., 2., 3., 4.];
4868        let loaddr = lo.as_ptr();
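        // Two independent unaligned 128-bit loads: `loaddr` supplies the low lane and
        // `hiaddr` the high lane of the result.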
4869        let r = _mm256_loadu2_m128(hiaddr, loaddr);
4870        let e = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4871        assert_eq_m256(r, e);
4872    }
4873
4874    #[simd_test(enable = "avx")]
4875    unsafe fn test_mm256_loadu2_m128d() {
4876        let hi = &[3., 4.];
4877        let hiaddr = hi.as_ptr();
4878        let lo = &[1., 2.];
4879        let loaddr = lo.as_ptr();
4880        let r = _mm256_loadu2_m128d(hiaddr, loaddr);
4881        let e = _mm256_setr_pd(1., 2., 3., 4.);
4882        assert_eq_m256d(r, e);
4883    }
4884
4885    #[simd_test(enable = "avx")]
4886    unsafe fn test_mm256_loadu2_m128i() {
4887        #[rustfmt::skip]
4888        let hi = _mm_setr_epi8(
4889            17, 18, 19, 20, 21, 22, 23, 24,
4890            25, 26, 27, 28, 29, 30, 31, 32,
4891        );
4892        #[rustfmt::skip]
4893        let lo = _mm_setr_epi8(
4894            1, 2, 3, 4, 5, 6, 7, 8,
4895            9, 10, 11, 12, 13, 14, 15, 16,
4896        );
4897        let r = _mm256_loadu2_m128i(ptr::addr_of!(hi) as *const _, ptr::addr_of!(lo) as *const _);
4898        #[rustfmt::skip]
4899        let e = _mm256_setr_epi8(
4900            1, 2, 3, 4, 5, 6, 7, 8,
4901            9, 10, 11, 12, 13, 14, 15, 16,
4902            17, 18, 19, 20, 21, 22, 23, 24,
4903            25, 26, 27, 28, 29, 30, 31, 32,
4904        );
4905        assert_eq_m256i(r, e);
4906    }
4907
4908    #[simd_test(enable = "avx")]
4909    unsafe fn test_mm256_storeu2_m128() {
4910        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4911        let mut hi = _mm_undefined_ps();
4912        let mut lo = _mm_undefined_ps();
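        // The high 128 bits of `a` go to the first pointer and the low 128 bits to the
        // second, both as unaligned stores.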
4913        _mm256_storeu2_m128(
4914            ptr::addr_of_mut!(hi) as *mut f32,
4915            ptr::addr_of_mut!(lo) as *mut f32,
4916            a,
4917        );
4918        assert_eq_m128(hi, _mm_setr_ps(5., 6., 7., 8.));
4919        assert_eq_m128(lo, _mm_setr_ps(1., 2., 3., 4.));
4920    }
4921
4922    #[simd_test(enable = "avx")]
4923    unsafe fn test_mm256_storeu2_m128d() {
4924        let a = _mm256_setr_pd(1., 2., 3., 4.);
4925        let mut hi = _mm_undefined_pd();
4926        let mut lo = _mm_undefined_pd();
4927        _mm256_storeu2_m128d(
4928            ptr::addr_of_mut!(hi) as *mut f64,
4929            ptr::addr_of_mut!(lo) as *mut f64,
4930            a,
4931        );
4932        assert_eq_m128d(hi, _mm_setr_pd(3., 4.));
4933        assert_eq_m128d(lo, _mm_setr_pd(1., 2.));
4934    }
4935
4936    #[simd_test(enable = "avx")]
4937    unsafe fn test_mm256_storeu2_m128i() {
4938        #[rustfmt::skip]
4939        let a = _mm256_setr_epi8(
4940            1, 2, 3, 4, 5, 6, 7, 8,
4941            9, 10, 11, 12, 13, 14, 15, 16,
4942            17, 18, 19, 20, 21, 22, 23, 24,
4943            25, 26, 27, 28, 29, 30, 31, 32,
4944        );
4945        let mut hi = _mm_undefined_si128();
4946        let mut lo = _mm_undefined_si128();
4947        _mm256_storeu2_m128i(ptr::addr_of_mut!(hi), ptr::addr_of_mut!(lo), a);
4948        #[rustfmt::skip]
4949        let e_hi = _mm_setr_epi8(
4950            17, 18, 19, 20, 21, 22, 23, 24,
4951            25, 26, 27, 28, 29, 30, 31, 32
4952        );
4953        #[rustfmt::skip]
4954        let e_lo = _mm_setr_epi8(
4955            1, 2, 3, 4, 5, 6, 7, 8,
4956            9, 10, 11, 12, 13, 14, 15, 16
4957        );
4958
4959        assert_eq_m128i(hi, e_hi);
4960        assert_eq_m128i(lo, e_lo);
4961    }
4962
4963    #[simd_test(enable = "avx")]
4964    unsafe fn test_mm256_cvtss_f32() {
4965        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
4966        let r = _mm256_cvtss_f32(a);
4967        assert_eq!(r, 1.);
4968    }
4969}