core/stdarch/crates/core_arch/src/x86/sse41.rs

//! Streaming SIMD Extensions 4.1 (SSE4.1)

use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::simd::*;

#[cfg(test)]
use stdarch_test::assert_instr;

// SSE4 rounding constants
/// round to nearest
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEAREST_INT: i32 = 0x00;
/// round down
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_NEG_INF: i32 = 0x01;
/// round up
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_POS_INF: i32 = 0x02;
/// truncate
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TO_ZERO: i32 = 0x03;
/// use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CUR_DIRECTION: i32 = 0x04;
/// do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RAISE_EXC: i32 = 0x00;
/// suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NO_EXC: i32 = 0x08;
/// round to nearest and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NINT: i32 = 0x00;
/// round down and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_FLOOR: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF;
/// round up and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_CEIL: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF;
/// truncate and do not suppress exceptions
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_TRUNC: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO;
/// use MXCSR.RC and do not suppress exceptions; see
/// `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_RINT: i32 = _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION;
/// use MXCSR.RC and suppress exceptions; see `vendor::_MM_SET_ROUNDING_MODE`
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FROUND_NEARBYINT: i32 = _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION;

/// Blend packed 8-bit integers from `a` and `b` using `mask`
///
/// The high bit of each corresponding mask byte determines the selection.
/// If the high bit is set, the element of `b` is selected.
/// Otherwise, the element of `a` is selected.
///
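/// # Example
///
/// A small illustrative sketch of the blend semantics (it assumes `sse4.1`
/// is detected at runtime, as in the `_mm_extract_ps` example below):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_set1_epi8(0);
/// let b = _mm_set1_epi8(1);
/// // Only lanes whose mask byte has the high bit set come from `b`.
/// let mask = _mm_setr_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
/// let r = _mm_blendv_epi8(a, b, mask);
/// let e = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, e)), 0xFFFF);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///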
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendvb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i {
    let mask: i8x16 = simd_lt(mask.as_i8x16(), i8x16::ZERO);
    transmute(simd_select(mask, b.as_i8x16(), a.as_i8x16()))
}

/// Blend packed 16-bit integers from `a` and `b` using the mask `IMM8`.
///
/// The mask bits determine the selection. A clear bit selects the
/// corresponding element of `a`, and a set bit the corresponding
/// element of `b`.
///
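/// # Example
///
/// A minimal sketch of how the immediate selects lanes (assumes runtime
/// `sse4.1` detection, as in the `_mm_extract_ps` example below):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_set1_epi16(0);
/// let b = _mm_set1_epi16(1);
/// // Bit i of the immediate selects lane i from `b`; here lanes 0 and 2.
/// let r = _mm_blend_epi16::<0b0000_0101>(a, b);
/// let e = _mm_setr_epi16(1, 0, 1, 0, 0, 0, 0, 0);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///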
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pblendw, IMM8 = 0xB1))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_epi16<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 8);
    transmute::<i16x8, _>(simd_shuffle!(
        a.as_i16x8(),
        b.as_i16x8(),
        [
            [0, 8][IMM8 as usize & 1],
            [1, 9][(IMM8 >> 1) as usize & 1],
            [2, 10][(IMM8 >> 2) as usize & 1],
            [3, 11][(IMM8 >> 3) as usize & 1],
            [4, 12][(IMM8 >> 4) as usize & 1],
            [5, 13][(IMM8 >> 5) as usize & 1],
            [6, 14][(IMM8 >> 6) as usize & 1],
            [7, 15][(IMM8 >> 7) as usize & 1],
        ]
    ))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d {
    let mask: i64x2 = simd_lt(transmute::<_, i64x2>(mask), i64x2::ZERO);
    transmute(simd_select(mask, b.as_f64x2(), a.as_f64x2()))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using `mask`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendvps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 {
    let mask: i32x4 = simd_lt(transmute::<_, i32x4>(mask), i32x4::ZERO);
    transmute(simd_select(mask, b.as_f32x4(), a.as_f32x4()))
}

/// Blend packed double-precision (64-bit) floating-point elements from `a`
/// and `b` using control mask `IMM2`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
// Note: LLVM7 prefers the single-precision floating-point domain when possible
// see https://bugs.llvm.org/show_bug.cgi?id=38195
// #[cfg_attr(test, assert_instr(blendpd, IMM2 = 0b10))]
#[cfg_attr(test, assert_instr(blendps, IMM2 = 0b10))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_pd<const IMM2: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM2, 2);
    transmute::<f64x2, _>(simd_shuffle!(
        a.as_f64x2(),
        b.as_f64x2(),
        [[0, 2][IMM2 as usize & 1], [1, 3][(IMM2 >> 1) as usize & 1]]
    ))
}

/// Blend packed single-precision (32-bit) floating-point elements from `a`
/// and `b` using mask `IMM4`
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(blendps, IMM4 = 0b0101))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_blend_ps<const IMM4: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM4, 4);
    transmute::<f32x4, _>(simd_shuffle!(
        a.as_f32x4(),
        b.as_f32x4(),
        [
            [0, 4][IMM4 as usize & 1],
            [1, 5][(IMM4 >> 1) as usize & 1],
            [2, 6][(IMM4 >> 2) as usize & 1],
            [3, 7][(IMM4 >> 3) as usize & 1],
        ]
    ))
}

/// Extracts a single-precision (32-bit) floating-point element from `a`,
/// selected with `IMM8`. The returned `i32` stores the float's bit-pattern,
/// and may be converted back to a floating-point number via `f32::from_bits`.
///
/// # Example
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let mut float_store = vec![1.0, 1.0, 2.0, 3.0];
/// let simd_floats = _mm_set_ps(2.5, 5.0, 7.5, 10.0);
/// let x: i32 = _mm_extract_ps::<2>(simd_floats);
/// float_store.push(f32::from_bits(x as u32));
/// assert_eq!(float_store, vec![1.0, 1.0, 2.0, 3.0, 5.0]);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(extractps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_ps<const IMM8: i32>(a: __m128) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract!(a, IMM8 as u32, f32).to_bits() as i32
}

/// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit
/// integer containing the zero-extended integer data.
///
/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pextrb, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi8<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 4);
    simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32
}

/// Extracts a 32-bit integer from `a`, selected with `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(extractps, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_extract_epi32<const IMM8: i32>(a: __m128i) -> i32 {
    static_assert_uimm_bits!(IMM8, 2);
    simd_extract!(a.as_i32x4(), IMM8 as u32, i32)
}

/// Selects a single value in `b` to store at some position in `a`,
/// then zeroes elements according to `IMM8`.
///
/// `IMM8` specifies which bits from operand `b` will be copied, which bits in
/// the result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made:
///
/// * Bits `[7:6]` specify the bits to copy from operand `b`:
///     - `00`: Selects bits `[31:0]` from operand `b`.
///     - `01`: Selects bits `[63:32]` from operand `b`.
///     - `10`: Selects bits `[95:64]` from operand `b`.
///     - `11`: Selects bits `[127:96]` from operand `b`.
///
/// * Bits `[5:4]` specify the bits in the result to which the selected bits
///   from operand `b` are copied:
///     - `00`: Copies the selected bits from `b` to result bits `[31:0]`.
///     - `01`: Copies the selected bits from `b` to result bits `[63:32]`.
///     - `10`: Copies the selected bits from `b` to result bits `[95:64]`.
///     - `11`: Copies the selected bits from `b` to result bits `[127:96]`.
///
/// * Bits `[3:0]`: If any of these bits are set, the corresponding result
///   element is cleared.
///
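/// # Example
///
/// A minimal sketch of the bit fields in action (assumes runtime `sse4.1`
/// detection, as in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
/// // Bits [7:6] = 0b11: take element 3 of `b` (8.0);
/// // bits [5:4] = 0b00: write it to element 0 of the result;
/// // bits [3:0] = 0b0100: zero element 2 of the result.
/// let r = _mm_insert_ps::<0b11_00_0100>(a, b);
/// let e = _mm_setr_ps(8.0, 2.0, 0.0, 4.0);
/// assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, e)), 0b1111);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///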
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(insertps, IMM8 = 0b1010))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    insertps(a, b, IMM8 as u8)
}

/// Returns a copy of `a` with the 8-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrb, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi8<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 4);
    transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8))
}

/// Returns a copy of `a` with the 32-bit integer from `i` inserted at a
/// location specified by `IMM8`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pinsrd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_insert_epi32<const IMM8: i32>(a: __m128i, i: i32) -> __m128i {
    static_assert_uimm_bits!(IMM8, 2);
    transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed maximum
/// values in dst.
///
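/// # Example
///
/// A small sketch highlighting that the comparison is signed (assumes runtime
/// `sse4.1` detection, as in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_set1_epi8(-1);
/// let b = _mm_set1_epi8(1);
/// // Signed comparison: 1 > -1, so every lane comes from `b`.
/// let r = _mm_max_epi8(a, b);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi8(r, b)), 0xFFFF);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///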
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// maximum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let b = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_gt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed maximum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmaxud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let b = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_gt(a, b), a, b))
}

/// Compares packed 8-bit integers in `a` and `b` and returns packed minimum
/// values in dst.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsb))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let b = b.as_i8x16();
    transmute(simd_select::<i8x16, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns packed
/// minimum.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let b = b.as_u16x8();
    transmute(simd_select::<i16x8, _>(simd_lt(a, b), a, b))
}

/// Compares packed 32-bit integers in `a` and `b`, and returns packed minimum
/// values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let b = b.as_i32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns packed
/// minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pminud))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let b = b.as_u32x4();
    transmute(simd_select::<i32x4, _>(simd_lt(a, b), a, b))
}

/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
/// using unsigned saturation
///
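/// # Example
///
/// A minimal sketch of the saturation behavior (assumes runtime `sse4.1`
/// detection, as in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_epi32(-1, 0, 70000, 128);
/// // Unsigned saturation clamps each 32-bit value into 0..=65535,
/// // so -1 becomes 0 and 70000 becomes 65535 (0xFFFF, i.e. -1 as i16).
/// let r = _mm_packus_epi32(a, a);
/// let e = _mm_setr_epi16(0, 0, -1, 128, 0, 0, -1, 128);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///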
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(packusdw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(packusdw(a.as_i32x4(), b.as_i32x4()))
}

/// Compares packed 64-bit integers in `a` and `b` for equality
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_eq::<_, i64x2>(a.as_i64x2(), b.as_i64x2()))
}

/// Sign extend packed 8-bit integers in `a` to packed 16-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Sign extend packed 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed
/// 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i {
    let a = a.as_i8x16();
    let a: i8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Sign extend packed 16-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i {
    let a = a.as_i16x8();
    let a: i16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Sign extend packed 32-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovsxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i {
    let a = a.as_i32x4();
    let a: i32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers
///
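/// # Example
///
/// A small sketch contrasting zero extension with the sign extension of
/// `_mm_cvtepi8_epi16` (assumes runtime `sse4.1` detection, as in the
/// `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_set1_epi8(-1); // every byte is 0xFF
/// // Zero extension treats each byte as unsigned, so 0xFF widens to 255,
/// // whereas `_mm_cvtepi8_epi16` would sign extend it to -1.
/// let r = _mm_cvtepu8_epi16(a);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, _mm_set1_epi16(255))), 0xFFFF);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///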
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
    transmute(simd_cast::<_, i16x8>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 8-bit integers in `a` to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxbq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i {
    let a = a.as_u8x16();
    let a: u8x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 32-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
    transmute(simd_cast::<_, i32x4>(a))
}

/// Zero extend packed unsigned 16-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxwq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i {
    let a = a.as_u16x8();
    let a: u16x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Zero extend packed unsigned 32-bit integers in `a`
/// to packed 64-bit integers
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmovzxdq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i {
    let a = a.as_u32x4();
    let a: u32x2 = simd_shuffle!(a, a, [0, 1]);
    transmute(simd_cast::<_, i64x2>(a))
}

/// Returns the dot product of two `__m128d` vectors.
///
/// `IMM8[1:0]` is the broadcast mask, and `IMM8[5:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dppd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_pd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(IMM8, 8);
    dppd(a, b, IMM8 as u8)
}

/// Returns the dot product of two `__m128` vectors.
///
/// `IMM8[3:0]` is the broadcast mask, and `IMM8[7:4]` is the condition mask.
/// If a condition mask bit is zero, the corresponding multiplication is
/// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of
/// the dot product will be stored in the return value component. Otherwise if
/// the broadcast mask bit is zero then the return component will be zero.
///
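/// # Example
///
/// A worked example of the two masks (assumes runtime `sse4.1` detection, as
/// in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
/// let b = _mm_setr_ps(10.0, 20.0, 30.0, 40.0);
/// // Condition mask IMM8[7:4] = 0b0111: multiply lanes 0, 1 and 2 only,
/// // giving 1*10 + 2*20 + 3*30 = 140. Broadcast mask IMM8[3:0] = 0b0001:
/// // store the sum in lane 0 and zero the other lanes.
/// let r = _mm_dp_ps::<0b0111_0001>(a, b);
/// assert_eq!(_mm_cvtss_f32(r), 140.0);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///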
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(dpps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_dp_ps<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(IMM8, 8);
    dpps(a, b, IMM8 as u8)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d {
    simd_floor(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// down to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 {
    simd_floor(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// down to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
///
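/// # Example
///
/// A minimal sketch of the merge semantics (assumes runtime `sse4.1`
/// detection, as in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_pd(10.0, 20.0);
/// let b = _mm_setr_pd(1.7, 99.9);
/// // Lower lane: floor(1.7) = 1.0; upper lane: copied from `a` (20.0).
/// let r = _mm_floor_sd(a, b);
/// let e = _mm_setr_pd(1.0, 20.0);
/// assert_eq!(_mm_movemask_pd(_mm_cmpeq_pd(r, e)), 0b11);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///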
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_FLOOR)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// down to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_FLOOR)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed double-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d {
    simd_ceil(a)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// up to an integer value, and stores the results as packed single-precision
/// floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 {
    simd_ceil(a)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// up to an integer value, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d {
    roundsd(a, b, _MM_FROUND_CEIL)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// up to an integer value, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 {
    roundss(a, b, _MM_FROUND_CEIL)
}

/// Round the packed double-precision (64-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// double-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundpd, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_pd<const ROUNDING: i32>(a: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundpd(a, ROUNDING)
}

/// Round the packed single-precision (32-bit) floating-point elements in `a`
/// using the `ROUNDING` parameter, and stores the results as packed
/// single-precision floating-point elements.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
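/// # Example
///
/// A small sketch of round-to-nearest-even, as shown in the list above
/// (assumes runtime `sse4.1` detection, as in the `_mm_extract_ps` example
/// above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_ps(1.5, 2.5, -1.5, 3.7);
/// // _MM_FROUND_TO_NEAREST_INT rounds halfway cases to the even neighbor.
/// let r = _mm_round_ps::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// let e = _mm_setr_ps(2.0, 2.0, -2.0, 4.0);
/// assert_eq!(_mm_movemask_ps(_mm_cmpeq_ps(r, e)), 0b1111);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///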
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundps, ROUNDING = 0))]
#[rustc_legacy_const_generics(1)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ps<const ROUNDING: i32>(a: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundps(a, ROUNDING)
}

/// Round the lower double-precision (64-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a double-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper element from `a` to the upper element of the intrinsic
/// result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundsd, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundsd(a, b, ROUNDING)
}

/// Round the lower single-precision (32-bit) floating-point element in `b`
/// using the `ROUNDING` parameter, store the result as a single-precision
/// floating-point element in the lower element of the intrinsic result,
/// and copies the upper 3 packed elements from `a` to the upper elements
/// of the intrinsic result.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(roundss, ROUNDING = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_uimm_bits!(ROUNDING, 4);
    roundss(a, b, ROUNDING)
}

/// Finds the minimum unsigned 16-bit element in the 128-bit `__m128i` vector,
/// returning a vector containing its value in its first position, and its
/// index in its second position; all other elements are set to zero.
///
/// This intrinsic corresponds to the `VPHMINPOSUW` / `PHMINPOSUW`
/// instruction.
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
///
/// Returns:
///
/// A 128-bit value where:
///
/// * bits `[15:0]` - contain the minimum value found in parameter `a`,
/// * bits `[18:16]` - contain the index of the minimum value
/// * remaining bits are set to `0`.
///
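/// # Example
///
/// A minimal sketch of the layout described in the list above (assumes
/// runtime `sse4.1` detection, as in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_epi16(40, 30, 20, 10, 50, 60, 70, 80);
/// let r = _mm_minpos_epu16(a);
/// // Lane 0 holds the minimum value (10); lane 1 holds its index (3).
/// assert_eq!(_mm_extract_epi16::<0>(r), 10);
/// assert_eq!(_mm_extract_epi16::<1>(r), 3);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///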
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(phminposuw))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i {
    transmute(phminposuw(a.as_u16x8()))
}

/// Multiplies the low 32-bit integers from each packed 64-bit
/// element in `a` and `b`, and returns the signed 64-bit result.
///
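/// # Example
///
/// A small sketch: only the low (even-indexed) 32-bit lanes take part
/// (assumes runtime `sse4.1` detection, as in the `_mm_extract_ps` example
/// above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_epi32(-2, 111, 3, 222); // lanes 1 and 3 are ignored
/// let b = _mm_setr_epi32(5, 333, 4, 444);
/// let r = _mm_mul_epi32(a, b);
/// // r = [-2 * 5, 3 * 4] as two signed 64-bit integers
/// let e = _mm_setr_epi64x(-10, 12);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi64(r, e)), 0xFFFF);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///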
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmuldq))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i {
    let a = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(a.as_i64x2()));
    let b = simd_cast::<_, i64x2>(simd_cast::<_, i32x2>(b.as_i64x2()));
    transmute(simd_mul(a, b))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, producing
/// intermediate 64-bit integers, and returns the low 32 bits of each product,
/// reinterpreted as signed integers. While `pmulld __m128i::splat(2),
/// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping
/// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would
/// return a negative number.
///
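/// # Example
///
/// A minimal sketch of the wrapping behavior described above (assumes runtime
/// `sse4.1` detection, as in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_set1_epi32(i32::MAX);
/// let b = _mm_set1_epi32(2);
/// // The full product 0xFFFF_FFFE keeps only its low 32 bits: -2 as i32.
/// let r = _mm_mullo_epi32(a, b);
/// assert_eq!(_mm_extract_epi32::<0>(r), -2);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///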
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pmulld))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i {
    transmute(simd_mul(a.as_i32x4(), b.as_i32x4()))
}

/// Subtracts 8-bit unsigned integer values and computes the absolute
/// values of the differences, writing them to the corresponding bits in the
/// destination. Sums of those absolute differences are then returned
/// according to the bit fields in the immediate operand.
///
/// The following algorithm is performed:
///
/// ```ignore
/// i = IMM8[2] * 4
/// j = IMM8[1:0] * 4
/// for k := 0 to 7
///     d0 = abs(a[i + k + 0] - b[j + 0])
///     d1 = abs(a[i + k + 1] - b[j + 1])
///     d2 = abs(a[i + k + 2] - b[j + 2])
///     d3 = abs(a[i + k + 3] - b[j + 3])
///     r[k] = d0 + d1 + d2 + d3
/// ```
///
/// Arguments:
///
/// * `a` - A 128-bit vector of type `__m128i`.
/// * `b` - A 128-bit vector of type `__m128i`.
/// * `IMM8` - An 8-bit immediate operand specifying how the absolute
///   differences are to be calculated:
///     * Bit `[2]` specifies the offset for operand `a`
///     * Bits `[1:0]` specify the offset for operand `b`
///
/// Returns:
///
/// * A `__m128i` vector containing the sums of the sets of absolute
///   differences between both operands.
///
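/// # Example
///
/// A worked example with both offsets zero (assumes runtime `sse4.1`
/// detection, as in the `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
/// let b = _mm_set1_epi8(0);
/// // With IMM8 = 0, r[k] = a[k] + a[k+1] + a[k+2] + a[k+3] (differences
/// // against zero), e.g. r[0] = 1 + 2 + 3 + 4 = 10.
/// let r = _mm_mpsadbw_epu8::<0>(a, b);
/// let e = _mm_setr_epi16(10, 14, 18, 22, 26, 30, 34, 38);
/// assert_eq!(_mm_movemask_epi8(_mm_cmpeq_epi16(r, e)), 0xFFFF);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///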
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(mpsadbw, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mpsadbw_epu8<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_uimm_bits!(IMM8, 3);
    transmute(mpsadbw(a.as_u8x16(), b.as_u8x16(), IMM8 as u8))
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
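/// # Example
///
/// A minimal sketch: the result is `1` exactly when `a & mask` is zero, as
/// the list above describes (assumes runtime `sse4.1` detection, as in the
/// `_mm_extract_ps` example above):
///
/// ```rust
/// # #[cfg(target_arch = "x86")]
/// # use std::arch::x86::*;
/// # #[cfg(target_arch = "x86_64")]
/// # use std::arch::x86_64::*;
/// # fn main() {
/// #    if is_x86_feature_detected!("sse4.1") {
/// #       #[target_feature(enable = "sse4.1")]
/// #       unsafe fn worker() {
/// let a = _mm_setr_epi32(0b0101, 0, 0, 0);
/// // `a` and the mask share no set bits, so the masked bits are all zero.
/// assert_eq!(_mm_testz_si128(a, _mm_setr_epi32(0b1010, 0, 0, 0)), 1);
/// // Bit 2 is set in both `a` and the mask, so the test fails.
/// assert_eq!(_mm_testz_si128(a, _mm_setr_epi32(0b0110, 0, 0, 0)), 0);
/// #       }
/// #       unsafe { worker() }
/// #   }
/// # }
/// ```
///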
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestz(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 {
    ptestnzc(a.as_i64x2(), mask.as_i64x2())
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// zeros.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are all zeros,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testz_si128(a, mask)
}

/// Tests whether the specified bits in a 128-bit integer vector are all
/// ones.
///
/// Argument:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
///
/// Returns:
///
/// * `1` - if the bits specified in the operand are all set to 1,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(pcmpeqd))]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 {
    _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}

/// Tests whether the specified bits in a 128-bit integer vector are
/// neither all zeros nor all ones.
///
/// Arguments:
///
/// * `a` - A 128-bit integer vector containing the bits to be tested.
/// * `mask` - A 128-bit integer vector selecting which bits to test in
///   operand `a`.
///
/// Returns:
///
/// * `1` - if the specified bits are neither all zeros nor all ones,
/// * `0` - otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros)
#[inline]
#[target_feature(enable = "sse4.1")]
#[cfg_attr(test, assert_instr(ptest))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 {
    _mm_testnzc_si128(a, mask)
}

/// Load 128 bits of integer data from memory into `dst`. `mem_addr` must be
/// aligned on a 16-byte boundary or a general-protection exception may be
/// generated. To minimize caching, the data is flagged as non-temporal
/// (unlikely to be used again soon).
1072///
1073/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128)
1074#[inline]
1075#[target_feature(enable = "sse4.1")]
1076#[cfg_attr(test, assert_instr(movntdqa))]
1077#[stable(feature = "simd_x86_updates", since = "1.82.0")]
1078pub unsafe fn _mm_stream_load_si128(mem_addr: *const __m128i) -> __m128i {
1079    let dst: __m128i;
1080    crate::arch::asm!(
1081        vpl!("movntdqa {a}"),
1082        a = out(xmm_reg) dst,
1083        p = in(reg) mem_addr,
1084        options(pure, readonly, nostack, preserves_flags),
1085    );
1086    dst
1087}
1088
1089#[allow(improper_ctypes)]
1090extern "C" {
1091    #[link_name = "llvm.x86.sse41.insertps"]
1092    fn insertps(a: __m128, b: __m128, imm8: u8) -> __m128;
1093    #[link_name = "llvm.x86.sse41.packusdw"]
1094    fn packusdw(a: i32x4, b: i32x4) -> u16x8;
1095    #[link_name = "llvm.x86.sse41.dppd"]
1096    fn dppd(a: __m128d, b: __m128d, imm8: u8) -> __m128d;
1097    #[link_name = "llvm.x86.sse41.dpps"]
1098    fn dpps(a: __m128, b: __m128, imm8: u8) -> __m128;
1099    #[link_name = "llvm.x86.sse41.round.pd"]
1100    fn roundpd(a: __m128d, rounding: i32) -> __m128d;
1101    #[link_name = "llvm.x86.sse41.round.ps"]
1102    fn roundps(a: __m128, rounding: i32) -> __m128;
1103    #[link_name = "llvm.x86.sse41.round.sd"]
1104    fn roundsd(a: __m128d, b: __m128d, rounding: i32) -> __m128d;
1105    #[link_name = "llvm.x86.sse41.round.ss"]
1106    fn roundss(a: __m128, b: __m128, rounding: i32) -> __m128;
1107    #[link_name = "llvm.x86.sse41.phminposuw"]
1108    fn phminposuw(a: u16x8) -> u16x8;
1109    #[link_name = "llvm.x86.sse41.mpsadbw"]
1110    fn mpsadbw(a: u8x16, b: u8x16, imm8: u8) -> u16x8;
1111    #[link_name = "llvm.x86.sse41.ptestz"]
1112    fn ptestz(a: i64x2, mask: i64x2) -> i32;
1113    #[link_name = "llvm.x86.sse41.ptestc"]
1114    fn ptestc(a: i64x2, mask: i64x2) -> i32;
1115    #[link_name = "llvm.x86.sse41.ptestnzc"]
1116    fn ptestnzc(a: i64x2, mask: i64x2) -> i32;
1117}
1118
1119#[cfg(test)]
1120mod tests {
1121    use crate::core_arch::x86::*;
1122    use std::mem;
1123    use stdarch_test::simd_test;
1124
1125    #[simd_test(enable = "sse4.1")]
1126    unsafe fn test_mm_blendv_epi8() {
1127        #[rustfmt::skip]
1128        let a = _mm_setr_epi8(
1129            0, 1, 2, 3, 4, 5, 6, 7,
1130            8, 9, 10, 11, 12, 13, 14, 15,
1131        );
1132        #[rustfmt::skip]
1133        let b = _mm_setr_epi8(
1134            16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
1135        );
1136        #[rustfmt::skip]
1137        let mask = _mm_setr_epi8(
1138            0, -1, 0, -1, 0, -1, 0, -1,
1139            0, -1, 0, -1, 0, -1, 0, -1,
1140        );
1141        #[rustfmt::skip]
1142        let e = _mm_setr_epi8(
1143            0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31,
1144        );
1145        assert_eq_m128i(_mm_blendv_epi8(a, b, mask), e);
1146    }
1147
1148    #[simd_test(enable = "sse4.1")]
1149    unsafe fn test_mm_blendv_pd() {
1150        let a = _mm_set1_pd(0.0);
1151        let b = _mm_set1_pd(1.0);
1152        let mask = transmute(_mm_setr_epi64x(0, -1));
1153        let r = _mm_blendv_pd(a, b, mask);
1154        let e = _mm_setr_pd(0.0, 1.0);
1155        assert_eq_m128d(r, e);
1156    }
1157
1158    #[simd_test(enable = "sse4.1")]
1159    unsafe fn test_mm_blendv_ps() {
1160        let a = _mm_set1_ps(0.0);
1161        let b = _mm_set1_ps(1.0);
1162        let mask = transmute(_mm_setr_epi32(0, -1, 0, -1));
1163        let r = _mm_blendv_ps(a, b, mask);
1164        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1165        assert_eq_m128(r, e);
1166    }
1167
1168    #[simd_test(enable = "sse4.1")]
1169    unsafe fn test_mm_blend_pd() {
1170        let a = _mm_set1_pd(0.0);
1171        let b = _mm_set1_pd(1.0);
1172        let r = _mm_blend_pd::<0b10>(a, b);
1173        let e = _mm_setr_pd(0.0, 1.0);
1174        assert_eq_m128d(r, e);
1175    }
1176
1177    #[simd_test(enable = "sse4.1")]
1178    unsafe fn test_mm_blend_ps() {
1179        let a = _mm_set1_ps(0.0);
1180        let b = _mm_set1_ps(1.0);
1181        let r = _mm_blend_ps::<0b1010>(a, b);
1182        let e = _mm_setr_ps(0.0, 1.0, 0.0, 1.0);
1183        assert_eq_m128(r, e);
1184    }
1185
1186    #[simd_test(enable = "sse4.1")]
1187    unsafe fn test_mm_blend_epi16() {
1188        let a = _mm_set1_epi16(0);
1189        let b = _mm_set1_epi16(1);
1190        let r = _mm_blend_epi16::<0b1010_1100>(a, b);
1191        let e = _mm_setr_epi16(0, 0, 1, 1, 0, 1, 0, 1);
1192        assert_eq_m128i(r, e);
1193    }
1194
1195    #[simd_test(enable = "sse4.1")]
1196    unsafe fn test_mm_extract_ps() {
1197        let a = _mm_setr_ps(0.0, 1.0, 2.0, 3.0);
1198        let r: f32 = f32::from_bits(_mm_extract_ps::<1>(a) as u32);
1199        assert_eq!(r, 1.0);
1200        let r: f32 = f32::from_bits(_mm_extract_ps::<3>(a) as u32);
1201        assert_eq!(r, 3.0);
1202    }
1203
1204    #[simd_test(enable = "sse4.1")]
1205    unsafe fn test_mm_extract_epi8() {
1206        #[rustfmt::skip]
1207        let a = _mm_setr_epi8(
1208            -1, 1, 2, 3, 4, 5, 6, 7,
1209            8, 9, 10, 11, 12, 13, 14, 15
1210        );
1211        let r1 = _mm_extract_epi8::<0>(a);
1212        let r2 = _mm_extract_epi8::<3>(a);
1213        assert_eq!(r1, 0xFF);
1214        assert_eq!(r2, 3);
1215    }
1216
1217    #[simd_test(enable = "sse4.1")]
1218    unsafe fn test_mm_extract_epi32() {
1219        let a = _mm_setr_epi32(0, 1, 2, 3);
1220        let r = _mm_extract_epi32::<1>(a);
1221        assert_eq!(r, 1);
1222        let r = _mm_extract_epi32::<3>(a);
1223        assert_eq!(r, 3);
1224    }
1225
1226    #[simd_test(enable = "sse4.1")]
1227    unsafe fn test_mm_insert_ps() {
1228        let a = _mm_set1_ps(1.0);
1229        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1230        let r = _mm_insert_ps::<0b11_00_1100>(a, b);
1231        let e = _mm_setr_ps(4.0, 1.0, 0.0, 0.0);
1232        assert_eq_m128(r, e);
1233
1234        // Zeroing takes precedence over copied value
1235        let a = _mm_set1_ps(1.0);
1236        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
1237        let r = _mm_insert_ps::<0b11_00_0001>(a, b);
1238        let e = _mm_setr_ps(0.0, 1.0, 1.0, 1.0);
1239        assert_eq_m128(r, e);
1240    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi8() {
        let a = _mm_set1_epi8(0);
        let e = _mm_setr_epi8(0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        let r = _mm_insert_epi8::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0);
        let r = _mm_insert_epi8::<14>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_insert_epi32() {
        let a = _mm_set1_epi32(0);
        let e = _mm_setr_epi32(0, 32, 0, 0);
        let r = _mm_insert_epi32::<1>(a, 32);
        assert_eq_m128i(r, e);
        let e = _mm_setr_epi32(0, 0, 0, 32);
        let r = _mm_insert_epi32::<3>(a, 32);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_max_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            2, 4, 6, 8, 10, 12, 14, 16,
            18, 20, 22, 24, 26, 28, 30, 32,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_max_epu16(a, b);
        let e = _mm_setr_epi16(2, 4, 6, 8, 10, 12, 14, 16);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epi32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epi32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_max_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_max_epu32(a, b);
        let e = _mm_setr_epi32(2, 4, 6, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_1() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, 4, 5, 8, 9, 12, 13, 16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, 3, 6, 7, 10, 11, 14, 15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi8_2() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            1, -4, -5, 8, -9, -12, 13, -16,
            17, 20, 21, 24, 25, 28, 29, 32,
        );
        #[rustfmt::skip]
        let b = _mm_setr_epi8(
            2, -3, -6, 7, -10, -11, 14, -15,
            18, 19, 22, 23, 26, 27, 30, 31,
        );
        let r = _mm_min_epi8(a, b);
        #[rustfmt::skip]
        let e = _mm_setr_epi8(
            1, -4, -6, 7, -10, -12, 13, -16,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu16() {
        let a = _mm_setr_epi16(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm_setr_epi16(2, 3, 6, 7, 10, 11, 14, 15);
        let r = _mm_min_epu16(a, b);
        let e = _mm_setr_epi16(1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_1() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epi32_2() {
        let a = _mm_setr_epi32(-1, 4, 5, -7);
        let b = _mm_setr_epi32(-2, 3, -6, 8);
        let r = _mm_min_epi32(a, b);
        let e = _mm_setr_epi32(-2, 3, -6, -7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_min_epu32() {
        let a = _mm_setr_epi32(1, 4, 5, 8);
        let b = _mm_setr_epi32(2, 3, 6, 7);
        let r = _mm_min_epu32(a, b);
        let e = _mm_setr_epi32(1, 3, 5, 7);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32() {
        let a = _mm_setr_epi32(1, 2, 3, 4);
        let b = _mm_setr_epi32(-1, -2, -3, -4);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
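
    // Supplementary sketch (an illustrative addition, not part of the
    // original suite): `_mm_packus_epi32` saturates on both ends, so
    // values above `u16::MAX` clamp to 65535 (which `_mm_setr_epi16`
    // spells as -1), just as negative values clamp to 0.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_packus_epi32_saturation_sketch() {
        let a = _mm_setr_epi32(70000, 65535, 65536, -1);
        let b = _mm_setr_epi32(0, 1, i32::MAX, i32::MIN);
        let r = _mm_packus_epi32(a, b);
        let e = _mm_setr_epi16(-1, -1, -1, 0, 0, 1, -1, 0);
        assert_eq_m128i(r, e);
    }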

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cmpeq_epi64() {
        let a = _mm_setr_epi64x(0, 1);
        let b = _mm_setr_epi64x(0, 0);
        let r = _mm_cmpeq_epi64(a, b);
        let e = _mm_setr_epi64x(-1, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi16(a);
        let e = _mm_set1_epi16(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi8(-10);
        let r = _mm_cvtepi8_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi32(a);
        let e = _mm_set1_epi32(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi16(-10);
        let r = _mm_cvtepi16_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepi32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
        let a = _mm_set1_epi32(-10);
        let r = _mm_cvtepi32_epi64(a);
        let e = _mm_set1_epi64x(-10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi16() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi16(a);
        let e = _mm_set1_epi16(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi32() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu8_epi64() {
        let a = _mm_set1_epi8(10);
        let r = _mm_cvtepu8_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi32() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi32(a);
        let e = _mm_set1_epi32(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu16_epi64() {
        let a = _mm_set1_epi16(10);
        let r = _mm_cvtepu16_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_cvtepu32_epi64() {
        let a = _mm_set1_epi32(10);
        let r = _mm_cvtepu32_epi64(a);
        let e = _mm_set1_epi64x(10);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_pd() {
        let a = _mm_setr_pd(2.0, 3.0);
        let b = _mm_setr_pd(1.0, 4.0);
        let e = _mm_setr_pd(14.0, 0.0);
        assert_eq_m128d(_mm_dp_pd::<0b00110001>(a, b), e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        let e = _mm_setr_ps(14.5, 0.0, 14.5, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b01110101>(a, b), e);
    }
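
    // Illustrative companion (a sketch, not part of the original
    // suite): decodes the `_mm_dp_ps` immediate used above. The high
    // nibble selects which lanes enter the dot product and the low
    // nibble selects which result lanes receive the sum, so
    // 0b0111_0101 multiplies lanes 0..=2 and broadcasts the sum to
    // lanes 0 and 2.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_dp_ps_imm_decode_sketch() {
        let a = _mm_setr_ps(2.0, 3.0, 1.0, 10.0);
        let b = _mm_setr_ps(1.0, 4.0, 0.5, 10.0);
        // Lane 3 is excluded by the high nibble.
        let dot = 2.0 * 1.0 + 3.0 * 4.0 + 1.0 * 0.5; // 14.5
        let e = _mm_setr_ps(dot, 0.0, dot, 0.0);
        assert_eq_m128(_mm_dp_ps::<0b0111_0101>(a, b), e);
    }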

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_pd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let r = _mm_floor_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ps() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let r = _mm_floor_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_sd() {
        let a = _mm_setr_pd(2.5, 4.5);
        let b = _mm_setr_pd(-1.5, -3.5);
        let r = _mm_floor_sd(a, b);
        let e = _mm_setr_pd(-2.0, 4.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_floor_ss() {
        let a = _mm_setr_ps(2.5, 4.5, 8.5, 16.5);
        let b = _mm_setr_ps(-1.5, -3.5, -7.5, -15.5);
        let r = _mm_floor_ss(a, b);
        let e = _mm_setr_ps(-2.0, 4.5, 8.5, 16.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_pd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let r = _mm_ceil_pd(a);
        let e = _mm_setr_pd(2.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ps() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let r = _mm_ceil_ps(a);
        let e = _mm_setr_ps(2.0, 4.0, 8.0, 16.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_ceil_sd(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ceil_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-2.5, -4.5, -8.5, -16.5);
        let r = _mm_ceil_ss(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_pd() {
        let a = _mm_setr_pd(1.25, 3.75);
        let r = _mm_round_pd::<_MM_FROUND_TO_NEAREST_INT>(a);
        let e = _mm_setr_pd(1.0, 4.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ps() {
        let a = _mm_setr_ps(2.25, 4.75, -1.75, -4.25);
        let r = _mm_round_ps::<_MM_FROUND_TO_ZERO>(a);
        let e = _mm_setr_ps(2.0, 4.0, -1.0, -4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_sd() {
        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEAREST_INT>(a, b);
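        // -2.5 rounds to -2.0: _MM_FROUND_TO_NEAREST_INT resolves ties
        // to the even value, not away from zero.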
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_pd(-3.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);

        let a = _mm_setr_pd(1.5, 3.5);
        let b = _mm_setr_pd(-2.5, -4.5);
        let r = _mm_round_sd::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_pd(-2.0, 3.5);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_round_ss() {
        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEAREST_INT>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_NEG_INF>(a, b);
        let e = _mm_setr_ps(-2.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_POS_INF>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);

        let a = _mm_setr_ps(1.5, 3.5, 7.5, 15.5);
        let b = _mm_setr_ps(-1.75, -4.5, -8.5, -16.5);
        let r = _mm_round_ss::<_MM_FROUND_TO_ZERO>(a, b);
        let e = _mm_setr_ps(-1.0, 3.5, 7.5, 15.5);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_1() {
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_2() {
        let a = _mm_setr_epi16(0, 18, 44, 97, 50, 13, 67, 66);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16_3() {
        // When the minimum value appears more than once, the match at
        // the lowest index is reported.
        let a = _mm_setr_epi16(23, 18, 44, 97, 50, 13, 67, 13);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(13, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(1, 3);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, 2 /* ignored */, 1234567, 4 /* ignored */);
            let b = _mm_setr_epi32(-20, -256 /* ignored */, 666666, 666666 /* ignored */);
            let r = _mm_mul_epi32(a, b);
            let e = _mm_setr_epi64x(-300, 823043843622);
            assert_eq_m128i(r, e);
        }
    }
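
    // Worked check (illustrative, not part of the original suite):
    // `_mm_mul_epi32` reads only the even lanes (0 and 2) and returns
    // the full 64-bit signed products, so no wrapping occurs.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mul_epi32_even_lanes_sketch() {
        let a = _mm_setr_epi32(1234567, 0, 1234567, 0);
        // The odd lanes are ignored; making them differ changes nothing.
        let b = _mm_setr_epi32(666666, 111, 666666, -222);
        let r = _mm_mul_epi32(a, b);
        let e = _mm_set1_epi64x(1234567i64 * 666666);
        assert_eq_m128i(r, e);
    }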

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32() {
        {
            let a = _mm_setr_epi32(1, 1, 1, 1);
            let b = _mm_setr_epi32(1, 2, 3, 4);
            let r = _mm_mullo_epi32(a, b);
            let e = _mm_setr_epi32(1, 2, 3, 4);
            assert_eq_m128i(r, e);
        }
        {
            let a = _mm_setr_epi32(15, -2, 1234567, 99999);
            let b = _mm_setr_epi32(-20, -256, 666666, -99999);
            let r = _mm_mullo_epi32(a, b);
            // Each product is truncated to its low 32 bits, so the most
            // significant bit of r[2] becomes a sign bit:
            // 1234567 * 666666 wraps to -1589877210.
            let e = _mm_setr_epi32(-300, 512, -1589877210, -1409865409);
            assert_eq_m128i(r, e);
        }
    }
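
    // Scalar cross-check (illustrative, not part of the original
    // suite): `_mm_mullo_epi32` keeps only the low 32 bits of each
    // product, which is exactly `i32::wrapping_mul`.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mullo_epi32_wrapping_sketch() {
        assert_eq!(1234567i32.wrapping_mul(666666), -1589877210);
        let r = _mm_mullo_epi32(_mm_set1_epi32(1234567), _mm_set1_epi32(666666));
        assert_eq_m128i(r, _mm_set1_epi32(-1589877210));
    }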

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_minpos_epu16() {
        let a = _mm_setr_epi16(8, 7, 6, 5, 4, 1, 2, 3);
        let r = _mm_minpos_epu16(a);
        let e = _mm_setr_epi16(1, 5, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8() {
        #[rustfmt::skip]
        let a = _mm_setr_epi8(
            0, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );

        let r = _mm_mpsadbw_epu8::<0b000>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b001>(a, a);
        let e = _mm_setr_epi16(16, 12, 8, 4, 0, 4, 8, 12);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b100>(a, a);
        let e = _mm_setr_epi16(16, 20, 24, 28, 32, 36, 40, 44);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b101>(a, a);
        let e = _mm_setr_epi16(0, 4, 8, 12, 16, 20, 24, 28);
        assert_eq_m128i(r, e);

        let r = _mm_mpsadbw_epu8::<0b111>(a, a);
        let e = _mm_setr_epi16(32, 28, 24, 20, 16, 12, 8, 4);
        assert_eq_m128i(r, e);
    }
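
    // Scalar reference (illustrative, not part of the original suite):
    // with IMM8 = 0b000, result lane `i` is the sum of absolute
    // differences between the sliding window `a[i..i + 4]` and the
    // fixed block `b[0..4]`, which reproduces the 0, 4, 8, ... pattern
    // asserted above.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_mpsadbw_epu8_scalar_sketch() {
        let bytes: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        let a = _mm_loadu_si128(bytes.as_ptr() as *const __m128i);
        let mut sums = [0i16; 8];
        for i in 0..8 {
            for j in 0..4 {
                sums[i] += (bytes[i + j] as i16 - bytes[j] as i16).abs();
            }
        }
        let e = _mm_loadu_si128(sums.as_ptr() as *const __m128i);
        assert_eq_m128i(_mm_mpsadbw_epu8::<0b000>(a, a), e);
    }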

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testz_si128() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testz_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testc_si128() {
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_testc_si128(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_testnzc_si128() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_testnzc_si128(a, mask);
        assert_eq!(r, 0);
    }
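
    // Relationship sketch (illustrative, not part of the original
    // suite): PTEST sets ZF = ((a & mask) == 0) and CF =
    // ((!a & mask) == 0); `_mm_testnzc_si128` returns 1 exactly when
    // both flags are clear, i.e. when neither `_mm_testz_si128` nor
    // `_mm_testc_si128` reports 1.
    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_ptest_flag_relation_sketch() {
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let neither =
            (_mm_testz_si128(a, mask) == 0 && _mm_testc_si128(a, mask) == 0) as i32;
        assert_eq!(_mm_testnzc_si128(a, mask), neither);
    }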

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_zeros() {
        let a = _mm_set1_epi8(1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b011);
        let mask = _mm_set1_epi8(0b100);
        let r = _mm_test_all_zeros(a, mask);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_all_ones() {
        let a = _mm_set1_epi8(-1);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let r = _mm_test_all_ones(a);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_test_mix_ones_zeros() {
        let a = _mm_set1_epi8(0);
        let mask = _mm_set1_epi8(1);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(-1);
        let mask = _mm_set1_epi8(0);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b110);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 1);
        let a = _mm_set1_epi8(0b101);
        let mask = _mm_set1_epi8(0b101);
        let r = _mm_test_mix_ones_zeros(a, mask);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "sse4.1")]
    unsafe fn test_mm_stream_load_si128() {
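        // `_mm_stream_load_si128` (MOVNTDQA) requires a 16-byte-aligned
        // pointer; `__m128i` has 16-byte alignment, so a local's address
        // qualifies.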
        let a = _mm_set_epi64x(5, 6);
        let r = _mm_stream_load_si128(core::ptr::addr_of!(a) as *const _);
        assert_eq_m128i(a, r);
    }
}