core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::{fmaf16, simd::*};
use crate::ptr;

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
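///
/// # Example
///
/// A minimal sketch (illustrative only; assumes a nightly toolchain with the unstable `f16`
/// type and this `stdarch_x86_avx512_f16` feature enabled, plus a CPU with AVX512-FP16).
/// Note that arguments run from the highest element (`e7`) down to the lowest (`e0`):
///
/// ```ignore
/// let v = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
/// let mut out = [0.0f16; 8];
/// _mm_storeu_ph(out.as_mut_ptr(), v);
/// assert_eq!(out, [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); // e0 lands in lane 0
/// ```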
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_set_ph(
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_set_ph(
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_set_ph(
    e31: f16,
    e30: f16,
    e29: f16,
    e28: f16,
    e27: f16,
    e26: f16,
    e25: f16,
    e24: f16,
    e23: f16,
    e22: f16,
    e21: f16,
    e20: f16,
    e19: f16,
    e18: f16,
    e17: f16,
    e16: f16,
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Copy the half-precision (16-bit) floating-point element a to the lower element of dst, and zero
/// the upper 7 elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_set_sh(a: f16) -> __m128h {
    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_set1_ph(a: f16) -> __m128h {
    transmute(f16x8::splat(a))
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_set1_ph(a: f16) -> __m256h {
    transmute(f16x16::splat(a))
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_set1_ph(a: f16) -> __m512h {
    transmute(f16x32::splat(a))
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
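///
/// # Example
///
/// A minimal sketch (illustrative only; same nightly-feature and CPU assumptions as for
/// `_mm_set_ph`). Unlike `_mm_set_ph`, arguments here run from the lowest lane up:
///
/// ```ignore
/// let v = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
/// let mut out = [0.0f16; 8];
/// _mm_storeu_ph(out.as_mut_ptr(), v);
/// assert_eq!(out, [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); // first argument is lane 0
/// ```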
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
    e16: f16,
    e17: f16,
    e18: f16,
    e19: f16,
    e20: f16,
    e21: f16,
    e22: f16,
    e23: f16,
    e24: f16,
    e25: f16,
    e26: f16,
    e27: f16,
    e28: f16,
    e29: f16,
    e30: f16,
    e31: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Return vector of type __m128h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_setzero_ph() -> __m128h {
    transmute(f16x8::ZERO)
}

/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_setzero_ph() -> __m256h {
    transmute(f16x16::ZERO)
}

/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_setzero_ph() -> __m512h {
    transmute(f16x32::ZERO)
}

/// Return vector of type `__m128h` with undefined elements. In practice, this returns the all-zero
/// vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_undefined_ph() -> __m128h {
    transmute(f16x8::ZERO)
}

/// Return vector of type `__m256h` with undefined elements. In practice, this returns the all-zero
/// vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_undefined_ph() -> __m256h {
    transmute(f16x16::ZERO)
}

/// Return vector of type `__m512h` with undefined elements. In practice, this returns the all-zero
/// vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_undefined_ph() -> __m512h {
    transmute(f16x32::ZERO)
}

/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
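///
/// # Example
///
/// A minimal sketch (illustrative only; nightly-feature and CPU assumptions as above). The
/// cast reinterprets the same 128 bits, so a round trip is lossless:
///
/// ```ignore
/// let d = _mm_set1_pd(1.0);
/// let h = _mm_castpd_ph(d);
/// let back = _mm_castph_pd(h); // bitwise identical to `d`
/// ```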
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_castpd_ph(a: __m128d) -> __m128h {
    transmute(a)
}

/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    transmute(a)
}

/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    transmute(a)
}

/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_castph_pd(a: __m128h) -> __m128d {
    transmute(a)
}

/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castph_pd(a: __m256h) -> __m256d {
    transmute(a)
}

/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castph_pd(a: __m512h) -> __m512d {
    transmute(a)
}

/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_castps_ph(a: __m128) -> __m128h {
    transmute(a)
}

/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castps_ph(a: __m256) -> __m256h {
    transmute(a)
}

/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castps_ph(a: __m512) -> __m512h {
    transmute(a)
}

/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_castph_ps(a: __m128h) -> __m128 {
    transmute(a)
}

/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castph_ps(a: __m256h) -> __m256 {
    transmute(a)
}

/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castph_ps(a: __m512h) -> __m512 {
    transmute(a)
}

/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    transmute(a)
}

/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    transmute(a)
}

/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    transmute(a)
}

/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_castph_si128(a: __m128h) -> __m128i {
    transmute(a)
}

/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castph_si256(a: __m256h) -> __m256i {
    transmute(a)
}

/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castph_si512(a: __m512h) -> __m512i {
    transmute(a)
}

/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}

/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}

/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    simd_shuffle!(
        a,
        _mm_undefined_ph(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
    )
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    simd_shuffle!(
        a,
        _mm_undefined_ph(),
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8
        ]
    )
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    simd_shuffle!(
        a,
        _mm256_undefined_ph(),
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
            16, 16, 16, 16, 16, 16, 16, 16
        ]
    )
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
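///
/// # Example
///
/// A minimal sketch (illustrative only; nightly-feature and CPU assumptions as above). Unlike
/// the plain `cast` intrinsics, the upper lanes are guaranteed zero:
///
/// ```ignore
/// let lo = _mm_set1_ph(1.0);
/// let wide = _mm256_zextph128_ph256(lo);
/// let mut out = [9.0f16; 16];
/// _mm256_storeu_ph(out.as_mut_ptr(), wide);
/// assert!(out[8..].iter().all(|&x| x == 0.0));
/// ```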
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    simd_shuffle!(
        a,
        _mm_setzero_ph(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
    )
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    simd_shuffle!(
        a,
        _mm256_setzero_ph(),
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16,
            16, 16, 16, 16, 16, 16, 16, 16
        ]
    )
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    simd_shuffle!(
        a,
        _mm_setzero_ph(),
        [
            0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
            8, 8, 8
        ]
    )
}

macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
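///
/// # Example
///
/// A minimal sketch (illustrative only; nightly-feature and CPU assumptions as above). Each
/// mask bit holds the comparison result for the corresponding lane:
///
/// ```ignore
/// let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
/// let b = _mm_set1_ph(4.0);
/// let k = _mm_cmp_ph_mask::<_CMP_LT_OS>(a, b);
/// assert_eq!(k, 0b0000_1111); // lanes 0..=3 are less than 4.0
/// ```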
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    cmp_asm!(__mmask8, xmm_reg, a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    cmp_asm!(__mmask8, k1, xmm_reg, a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    static_assert_uimm_bits!(IMM5, 5);
    cmp_asm!(__mmask16, ymm_reg, a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    static_assert_uimm_bits!(IMM5, 5);
    cmp_asm!(__mmask16, k1, ymm_reg, a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    static_assert_uimm_bits!(IMM5, 5);
    cmp_asm!(__mmask32, zmm_reg, a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    static_assert_uimm_bits!(IMM5, 5);
    cmp_asm!(__mmask32, k1, zmm_reg, a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    if SAE == _MM_FROUND_NO_EXC {
        let dst: __mmask32;
        asm!(
            "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
            k = lateout(kreg) dst,
            a = in(zmm_reg) a,
            b = in(zmm_reg) b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    } else {
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    if SAE == _MM_FROUND_NO_EXC {
        let dst: __mmask32;
        asm!(
            "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
            k = lateout(kreg) dst,
            k1 = in(kreg) k1,
            a = in(zmm_reg) a,
            b = in(zmm_reg) b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    } else {
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    vcmpsh(a, b, IMM5, k1, SAE)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cmp_sh_mask<const IMM5: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    vcomish(a, b, IMM5, SAE)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
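///
/// # Example
///
/// A minimal sketch (illustrative only; nightly-feature and CPU assumptions as above):
///
/// ```ignore
/// let a = _mm_set_sh(1.0);
/// let b = _mm_set_sh(1.0);
/// assert_eq!(_mm_comieq_sh(a, b), 1);
/// ```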
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OQ>(a, b)
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
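///
/// # Example
///
/// A minimal sketch (illustrative only; nightly-feature and CPU assumptions as above). The
/// `Aligned` wrapper is a hypothetical helper whose `#[repr(align(16))]` guarantees the
/// required 16-byte alignment:
///
/// ```ignore
/// #[repr(align(16))]
/// struct Aligned([f16; 8]);
///
/// let data = Aligned([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
/// let v = _mm_load_ph(data.0.as_ptr());
/// ```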
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
    *mem_addr.cast()
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
    *mem_addr.cast()
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
    *mem_addr.cast()
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
/// and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    _mm_set_sh(*mem_addr)
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst = src;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}"),
        dst = inout(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
    let mut dst: __m128h;
    asm!(
        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
        dst = out(xmm_reg) dst,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack, preserves_flags)
    );
    dst
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
    ptr::read_unaligned(mem_addr.cast())
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let mut mov: f16 = simd_extract!(src, 0);
    if (k & 1) != 0 {
        mov = simd_extract!(b, 0);
    }
    simd_insert!(a, 0, mov)
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let mut mov: f16 = 0.;
    if (k & 1) != 0 {
        mov = simd_extract!(b, 0);
    }
    simd_insert!(a, 0, mov)
}

/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
/// and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
    let mov: f16 = simd_extract!(b, 0);
    simd_insert!(a, 0, mov)
}

/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
    *mem_addr.cast() = a;
}

/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
    *mem_addr.cast() = a;
}

/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
    *mem_addr.cast() = a;
}

/// Store the lower half-precision (16-bit) floating-point element from a into memory.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
    *mem_addr = simd_extract!(a, 0);
}

/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
/// (the element is only stored when mask bit 0 is set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
    asm!(
        vps!("vmovdqu16", "{{{k}}}, {src}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        src = in(xmm_reg) a,
        options(nostack, preserves_flags)
    );
}

/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
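///
/// # Example
///
/// A minimal sketch (illustrative only; nightly-feature and CPU assumptions as above):
///
/// ```ignore
/// let v = _mm_set1_ph(2.5);
/// let mut out = [0.0f16; 8];
/// _mm_storeu_ph(out.as_mut_ptr(), v);
/// assert!(out.iter().all(|&x| x == 2.5));
/// ```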
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
    ptr::write_unaligned(mem_addr.cast(), a);
}

/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
    ptr::write_unaligned(mem_addr.cast(), a);
}

/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
/// The address does not need to be aligned to any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
    ptr::write_unaligned(mem_addr.cast(), a);
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
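///
/// # Example
///
/// A minimal sketch (illustrative only; nightly-feature and CPU assumptions as above):
///
/// ```ignore
/// let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
/// let b = _mm_set1_ph(1.0);
/// let sum = _mm_add_ph(a, b); // lanewise: [1.0, 2.0, ..., 8.0]
/// ```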
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
    simd_add(a, b)
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
1318#[inline]
1319#[target_feature(enable = "avx512fp16,avx512vl")]
1320#[cfg_attr(test, assert_instr(vaddph))]
1321#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1322pub unsafe fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1323    let r = _mm_add_ph(a, b);
1324    simd_select_bitmask(k, r, src)
1325}
1326
1327/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1328/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1329///
1330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
1331#[inline]
1332#[target_feature(enable = "avx512fp16,avx512vl")]
1333#[cfg_attr(test, assert_instr(vaddph))]
1334#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1335pub unsafe fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1336    let r = _mm_add_ph(a, b);
1337    simd_select_bitmask(k, r, _mm_setzero_ph())
1338}
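
// Sketch contrasting the writemask and zeromask forms (values illustrative;
// nightly `f16`/`stdarch_x86_avx512_f16` features assumed):
//
//     let src = _mm_set1_ph(9.0);
//     let a = _mm_set1_ph(1.0);
//     let b = _mm_set1_ph(2.0);
//     // Lanes 0..4: a + b == 3.0; lanes 4..8: copied from src (9.0).
//     let masked = _mm_mask_add_ph(src, 0b0000_1111, a, b);
//     // Lanes 0..4: 3.0; lanes 4..8: zeroed.
//     let zeroed = _mm_maskz_add_ph(0b0000_1111, a, b);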

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
    simd_add(a, b)
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    let r = _mm256_add_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    let r = _mm256_add_ph(a, b);
    simd_select_bitmask(k, r, _mm256_setzero_ph())
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
    simd_add(a, b)
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    let r = _mm512_add_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    let r = _mm512_add_ph(a, b);
    simd_select_bitmask(k, r, _mm512_setzero_ph())
}
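
// The wider forms only change the mask type (sketch; nightly features assumed
// as above): a `__mmask32` covers the 32 lanes of a `__m512h`, one bit per lane.
//
//     let k: __mmask32 = 0xFFFF_0000; // upper 16 lanes active
//     let r = _mm512_mask_add_ph(src, k, a, b); // low 16 lanes keep src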

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vaddph(a, b, ROUNDING)
}
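
// Embedded-rounding sketch (nightly features assumed as above): the const
// generic selects the rounding mode per instruction instead of reading `MXCSR.RC`.
//
//     let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b);
//     // Same sums as _mm512_add_ph(a, b), but rounded toward -infinity.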

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_add_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, src)
}

/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_add_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, _mm512_setzero_ph())
}

/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_add_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_add_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vaddsh(a, b, src, k, ROUNDING)
}

/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_add_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_add_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vaddsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_add_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
}
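
// Scalar-lane sketch (nightly features assumed as above): the `_sh` forms
// operate on element 0 only and pass the upper seven lanes of `a` through
// unchanged.
//
//     let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.5);
//     let b = _mm_set_sh(0.5);
//     let r = _mm_add_sh(a, b); // lane 0: 1.0; lanes 1..8 copied from a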

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
    simd_sub(a, b)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let r = _mm_sub_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let r = _mm_sub_ph(a, b);
    simd_select_bitmask(k, r, _mm_setzero_ph())
}
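
// Operand-order sketch (nightly features assumed as above): b is subtracted
// from a, not the other way around.
//
//     let a = _mm_set1_ph(5.0);
//     let b = _mm_set1_ph(3.0);
//     let r = _mm_sub_ph(a, b); // each lane: 5.0 - 3.0 == 2.0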

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
    simd_sub(a, b)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    let r = _mm256_sub_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    let r = _mm256_sub_ph(a, b);
    simd_select_bitmask(k, r, _mm256_setzero_ph())
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
    simd_sub(a, b)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    let r = _mm512_sub_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    let r = _mm512_sub_ph(a, b);
    simd_select_bitmask(k, r, _mm512_setzero_ph())
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vsubph(a, b, ROUNDING)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, src)
}

/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, _mm512_setzero_ph())
}

/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_sub_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vsubsh(a, b, src, k, ROUNDING)
}

/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_sub_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsubsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_sub_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
    simd_mul(a, b)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let r = _mm_mul_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let r = _mm_mul_ph(a, b);
    simd_select_bitmask(k, r, _mm_setzero_ph())
}
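
// Half-precision range sketch (nightly features assumed as above): the largest
// finite f16 value is 65504, so products beyond it round to infinity under the
// default rounding mode.
//
//     let a = _mm_set1_ph(300.0);
//     let r = _mm_mul_ph(a, a); // 90000.0 is not representable: each lane is +inf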

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
    simd_mul(a, b)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    let r = _mm256_mul_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    let r = _mm256_mul_ph(a, b);
    simd_select_bitmask(k, r, _mm256_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
    simd_mul(a, b)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    let r = _mm512_mul_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    let r = _mm512_mul_ph(a, b);
    simd_select_bitmask(k, r, _mm512_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vmulph(a, b, ROUNDING)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, src)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, _mm512_setzero_ph())
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vmulsh(a, b, src, k, ROUNDING)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmulsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_mul_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
}

/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vdivph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
    simd_div(a, b)
}

/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vdivph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let r = _mm_div_ph(a, b);
    simd_select_bitmask(k, r, src)
}

/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vdivph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    let r = _mm_div_ph(a, b);
    simd_select_bitmask(k, r, _mm_setzero_ph())
}
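
// IEEE 754 behavior sketch (nightly features assumed as above): division of a
// finite nonzero value by zero yields a signed infinity per lane (setting the
// divide-by-zero flag in MXCSR) rather than trapping by default.
//
//     let a = _mm_set1_ph(1.0);
//     let r = _mm_div_ph(a, _mm_setzero_ph()); // each lane: +inf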
2226
2227/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2228///
2229/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
2230#[inline]
2231#[target_feature(enable = "avx512fp16,avx512vl")]
2232#[cfg_attr(test, assert_instr(vdivph))]
2233#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2234pub unsafe fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
2235    simd_div(a, b)
2236}
2237
2238/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2239/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2240///
2241/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2242#[inline]
2243#[target_feature(enable = "avx512fp16,avx512vl")]
2244#[cfg_attr(test, assert_instr(vdivph))]
2245#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2246pub unsafe fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2247    let r = _mm256_div_ph(a, b);
2248    simd_select_bitmask(k, r, src)
2249}
2250
2251/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2252/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2253///
2254/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2255#[inline]
2256#[target_feature(enable = "avx512fp16,avx512vl")]
2257#[cfg_attr(test, assert_instr(vdivph))]
2258#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2259pub unsafe fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2260    let r = _mm256_div_ph(a, b);
2261    simd_select_bitmask(k, r, _mm256_setzero_ph())
2262}
2263
2264/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2265///
2266/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
2267#[inline]
2268#[target_feature(enable = "avx512fp16")]
2269#[cfg_attr(test, assert_instr(vdivph))]
2270#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2271pub unsafe fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
2272    simd_div(a, b)
2273}
2274
2275/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2276/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2277///
2278/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2279#[inline]
2280#[target_feature(enable = "avx512fp16")]
2281#[cfg_attr(test, assert_instr(vdivph))]
2282#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2283pub unsafe fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2284    let r = _mm512_div_ph(a, b);
2285    simd_select_bitmask(k, r, src)
2286}
2287
2288/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2289/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2290///
2291/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2292#[inline]
2293#[target_feature(enable = "avx512fp16")]
2294#[cfg_attr(test, assert_instr(vdivph))]
2295#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2296pub unsafe fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2297    let r = _mm512_div_ph(a, b);
2298    simd_select_bitmask(k, r, _mm512_setzero_ph())
2299}

/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vdivph(a, b, ROUNDING)
}

/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
/// writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_div_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, src)
}

/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r = _mm512_div_round_ph::<ROUNDING>(a, b);
    simd_select_bitmask(k, r, _mm512_setzero_ph())
}
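
// A minimal usage sketch (illustrative only; hypothetical helper): the `_round`
// variants take the mode as a const generic. 1.0 / 3.0 is inexact in f16, so
// the selected mode decides the unit in the last place of every lane.
#[cfg(test)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _sketch_div_round_ph() {
    let a = _mm512_set1_ph(1.0);
    let b = _mm512_set1_ph(3.0);
    // Truncate each quotient toward zero and suppress floating-point exceptions.
    let _r = _mm512_div_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
}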

/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_div_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_div_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vdivsh(a, b, src, k, ROUNDING)
}

/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_div_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_div_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Divide the lower half-precision (16-bit) floating-point elements in a by b, store the result in the
/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vdivsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_div_round_sh::<_MM_FROUND_CUR_DIRECTION>(k, a, b)
}
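
// A minimal usage sketch (illustrative only; hypothetical helper): the `_sh`
// forms touch only element 0, and the upper seven f16 lanes pass through from `a`.
#[cfg(test)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _sketch_div_sh() {
    let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 8.0);
    let b = _mm_set_sh(2.0);
    // Element 0 becomes 8.0 / 2.0 = 4.0; elements 1..=7 are copied from `a`.
    let _r = _mm_div_sh(a, b);
}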

/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k))
}

/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
}
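
// A minimal usage sketch (illustrative only; hypothetical helper): each pair of
// adjacent f16 lanes is one complex number, so a __m128h holds four of them and
// only the low 4 mask bits are significant. (1 + 2i) * (3 + 4i) = -5 + 10i.
#[cfg(test)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn _sketch_mul_pch() {
    // Four copies of 1 + 2i and of 3 + 4i (real part in the even lane).
    let a = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
    let b = _mm_set_ph(4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0);
    // Every complex lane of the result is -5 + 10i.
    let _r = _mm_mul_pch(a, b);
}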

/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k))
}

/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
}

/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
}

/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    transmute(vfmulcph_512(
        transmute(a),
        transmute(b),
        transmute(src),
        k,
        ROUNDING,
    ))
}

/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}

/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_sch(_mm_undefined_ph(), 0xff, a, b)
}

/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_sch(_mm_setzero_ph(), k, a, b)
}
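
// A minimal usage sketch (illustrative only; hypothetical helper): `_sch`
// multiplies just the lowest complex pair; lanes 2..=7 are copied from `a`.
#[cfg(test)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _sketch_mul_sch() {
    let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0); // lower complex: 1 + 2i
    let b = _mm_set_sh(3.0); // lower complex: 3 + 0i
    // Lower complex result: (1 + 2i) * 3 = 3 + 6i; the rest comes from `a`.
    let _r = _mm_mul_sch(a, b);
}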

/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    transmute(vfmulcsh(
        transmute(a),
        transmute(b),
        transmute(src),
        k,
        ROUNDING,
    ))
}

/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sch::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
    _mm_mul_pch(a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_pch(src, k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_mul_pch(k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
    _mm256_mul_pch(a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    _mm256_mask_mul_pch(src, k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    _mm256_maskz_mul_pch(k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_mul_pch(a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_mul_pch(src, k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_maskz_mul_pch(k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mul_round_pch::<ROUNDING>(a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
}

/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
}
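
// Note (sketch, illustrative only; hypothetical helper): the `fmul` intrinsics
// are thin aliases of the corresponding `mul` intrinsics, as their bodies above
// show, so the two calls below are interchangeable.
#[cfg(test)]
#[target_feature(enable = "avx512fp16")]
unsafe fn _sketch_fmul_alias() {
    let a = _mm512_set1_ph(1.0);
    let b = _mm512_set1_ph(2.0);
    let _x = _mm512_fmul_pch(a, b); // forwards to _mm512_mul_pch
    let _y = _mm512_mul_pch(a, b); // same instruction, same result
}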

/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
    _mm_mul_sch(a, b)
}

/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_mul_sch(src, k, a, b)
}

/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_mul_sch(k, a, b)
}

/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mul_round_sch::<ROUNDING>(a, b)
}

/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
}

/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
}
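
// A minimal usage sketch (illustrative only; hypothetical helper): `cmul`
// conjugates `b` before multiplying, i.e. it computes a * conj(b).
// (1 + 2i) * conj(3 + 4i) = (1 + 2i)(3 - 4i) = 11 + 2i.
#[cfg(test)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn _sketch_cmul_pch() {
    let a = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
    let b = _mm_set_ph(4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0);
    // Every complex lane of the result is 11 + 2i.
    let _r = _mm_cmul_pch(a, b);
}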

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    transmute(vfcmulcph_512(
        transmute(a),
        transmute(b),
        transmute(src),
        k,
        ROUNDING,
    ))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when the corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
}
3334
/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
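///
/// # Example
///
/// A minimal sketch (assumes `avx512fp16` support); only the lowest complex
/// number is computed:
///
/// ```ignore
/// // lowest complex lanes: a0 = 1 + 2i, b0 = 3 + 4i
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0);
/// let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
/// // a0 * conj(b0) = (1*3 + 2*4) + i*(2*3 - 1*4) = 11 + 2i
/// let r = _mm_cmul_sch(a, b);
/// ```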
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_cmul_sch(_mm_undefined_ph(), 0xff, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_cmul_sch(_mm_setzero_ph(), k, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_cmul_round_sch::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    transmute(vfcmulcsh(
        transmute(a),
        transmute(b),
        transmute(src),
        k,
        ROUNDING,
    ))
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_cmul_round_sch::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
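///
/// # Example
///
/// A minimal sketch showing that the `fcmul` spelling is an alias of `cmul`
/// (assumes `avx512fp16` and `avx512vl` support):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0); // 1 + 1i in each complex lane
/// let b = _mm_set1_ph(2.0); // 2 + 2i
/// // r1 and r2 are computed by the same operation
/// let r1 = _mm_fcmul_pch(a, b);
/// let r2 = _mm_cmul_pch(a, b);
/// ```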
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
    _mm_cmul_pch(a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_cmul_pch(src, k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_cmul_pch(k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
    _mm256_cmul_pch(a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    _mm256_mask_cmul_pch(src, k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
    _mm256_maskz_cmul_pch(k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_cmul_pch(a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_cmul_pch(src, k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_maskz_cmul_pch(k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_cmul_round_pch::<ROUNDING>(a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
    _mm_cmul_sch(a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_cmul_sch(src, k, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_maskz_cmul_sch(k, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_cmul_round_sch::<ROUNDING>(a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
}

/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
}

/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
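///
/// # Example
///
/// The absolute value is taken by clearing the sign bit, which in IEEE 754 is
/// the most significant bit of each 16-bit element; that is why the body is a
/// plain AND with `i16::MAX` (`0x7FFF`). A minimal sketch, assuming
/// `avx512fp16` and `avx512vl` support:
///
/// ```ignore
/// let v = _mm_set_ph(-8.0, 7.0, -6.0, 5.0, -4.0, 3.0, -2.0, 1.0);
/// // every element has its sign bit cleared:
/// // [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
/// let r = _mm_abs_ph(v);
/// ```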
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_abs_ph(v2: __m128h) -> __m128h {
    transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX)))
}

/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_abs_ph(v2: __m256h) -> __m256h {
    transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX)))
}

/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_abs_ph(v2: __m512h) -> __m512h {
    transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX)))
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
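///
/// # Example
///
/// Conjugation only flips the sign of the imaginary part. Each complex number
/// occupies one 32-bit lane with the imaginary element in the upper 16 bits,
/// so XORing every 32-bit lane with `i32::MIN` (`0x8000_0000`) flips exactly
/// that sign bit. A minimal sketch, assuming `avx512fp16` and `avx512vl`:
///
/// ```ignore
/// // complex lanes: 1 + 2i, 3 + 4i, 5 + 6i, 7 + 8i
/// let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
/// // conjugates: 1 - 2i, 3 - 4i, 5 - 6i, 7 - 8i
/// let r = _mm_conj_pch(a);
/// ```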
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_conj_pch(a: __m128h) -> __m128h {
    transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN)))
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    let r: __m128 = transmute(_mm_conj_pch(a));
    transmute(simd_select_bitmask(k, r, transmute(src)))
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_conj_pch(a: __m256h) -> __m256h {
    transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN)))
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
    let r: __m256 = transmute(_mm256_conj_pch(a));
    transmute(simd_select_bitmask(k, r, transmute(src)))
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_conj_pch(a: __m512h) -> __m512h {
    transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN)))
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
    let r: __m512 = transmute(_mm512_conj_pch(a));
    transmute(simd_select_bitmask(k, r, transmute(src)))
}

/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
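///
/// # Example
///
/// A minimal sketch of the complex multiply-accumulate, `a*b + c` (assumes
/// `avx512fp16` and `avx512vl` support; values chosen for illustration):
///
/// ```ignore
/// // lowest complex lanes: a0 = 1 + 2i, b0 = 3 + 4i, c0 = 1 + 1i
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 1.0);
/// let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
/// let c = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0);
/// // a0*b0 + c0 = (1*3 - 2*4) + i*(1*4 + 2*3) + (1 + 1i) = -4 + 11i
/// let r = _mm_fmadd_pch(a, b, c);
/// ```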
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_mask3_fmadd_pch(a, b, c, 0xff)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
    transmute(simd_select_bitmask(k, r, transmute(a)))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    transmute(vfmaddcph_mask3_128(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    transmute(vfmaddcph_maskz_128(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
    let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
    transmute(simd_select_bitmask(k, r, transmute(a)))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
    transmute(vfmaddcph_mask3_256(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    transmute(vfmaddcph_maskz_256(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
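///
/// # Example
///
/// A minimal sketch of an explicitly rounded complex multiply-accumulate
/// (assumes `avx512fp16` support):
///
/// ```ignore
/// let a = _mm512_set1_ph(1.0); // 1 + 1i in every complex lane
/// let b = _mm512_set1_ph(2.0); // 2 + 2i
/// let c = _mm512_set1_ph(0.5); // 0.5 + 0.5i
/// // a*b + c = (1*2 - 1*2) + i*(1*2 + 1*2) + (0.5 + 0.5i) = 0.5 + 4.5i
/// let r = _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```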
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask16,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
    transmute(simd_select_bitmask(k, r, transmute(a)))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using writemask k (the element is copied from c when the corresponding
/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask16,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    transmute(vfmaddcph_mask3_512(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
        ROUNDING,
    ))
}

/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    transmute(vfmaddcph_maskz_512(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
        ROUNDING,
    ))
}
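
// Reference semantics for the packed complex multiply-add intrinsics above: each pair of
// adjacent f16 lanes holds one complex number `re + i*im`, and every lane pair computes
// `a * b + c` in complex arithmetic. The helper below is an illustrative sketch only (it
// is not part of this module's API) and uses `f32` in place of the unstable `f16` type:
#[allow(dead_code)]
fn complex_fmadd_reference(a: [f32; 2], b: [f32; 2], c: [f32; 2]) -> [f32; 2] {
    [
        a[0] * b[0] - a[1] * b[1] + c[0], // real part of a*b + c
        a[0] * b[1] + a[1] * b[0] + c[1], // imaginary part of a*b + c
    ]
}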

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}
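
// A minimal usage sketch for the lower-complex multiply-add (a hypothetical helper, not
// part of this module): the lower two f16 lanes of `a` and `b` are treated as one complex
// number each, and the upper six lanes of `a` pass through unchanged. Assumes the caller
// has already verified AVX512FP16 support.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn fmadd_sch_example() -> __m128h {
    let a = _mm_set_ph(9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 2.0, 1.0); // lower complex: 1 + 2i
    let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0); // lower complex: 3 + 4i
    let c = _mm_set_sh(0.0); // accumulate into zero
    // (1 + 2i) * (3 + 4i) + 0 = -5 + 10i, so dst = [-5.0, 10.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0]
    _mm_fmadd_sch(a, b, c)
}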

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    transmute(vfmaddcsh_mask(
        transmute(a),
        transmute(b),
        transmute(c),
        0xff,
        ROUNDING,
    ))
}

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from a when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let a = transmute(a);
    let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what Clang does
    transmute(_mm_mask_move_ss(a, k, a, r))
}

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using writemask k (elements are copied from c when
/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let c = transmute(c);
    let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
    transmute(_mm_move_ss(c, r))
}

/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let a = transmute(a);
    let r = vfmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING);
    transmute(_mm_move_ss(a, r)) // FIXME: if `k == 0`, LLVM optimizes `vfmaddcsh_maskz` to an all-zero vector, which is incorrect
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
    transmute(simd_select_bitmask(k, r, transmute(a)))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    transmute(vfcmaddcph_mask3_128(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    transmute(vfcmaddcph_maskz_128(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}
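
// Reference semantics for the conjugating variants above: each lane pair computes
// `a * conj(b) + c` in complex arithmetic, i.e. the imaginary part of `b` enters with
// flipped sign. Illustrative sketch only (not part of this module's API), again using
// `f32` in place of the unstable `f16` type:
#[allow(dead_code)]
fn complex_fcmadd_reference(a: [f32; 2], b: [f32; 2], c: [f32; 2]) -> [f32; 2] {
    [
        a[0] * b[0] + a[1] * b[1] + c[0], // real part of a*conj(b) + c
        a[1] * b[0] - a[0] * b[1] + c[1], // imaginary part of a*conj(b) + c
    ]
}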

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
    let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
    transmute(simd_select_bitmask(k, r, transmute(a)))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
    transmute(vfcmaddcph_mask3_256(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    transmute(vfcmaddcph_maskz_256(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
    ))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask16,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
    transmute(simd_select_bitmask(k, r, transmute(a)))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask16,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    transmute(vfcmaddcph_mask3_512(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
        ROUNDING,
    ))
}

/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    transmute(vfcmaddcph_maskz_512(
        transmute(a),
        transmute(b),
        transmute(c),
        k,
        ROUNDING,
    ))
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    transmute(vfcmaddcsh_mask(
        transmute(a),
        transmute(b),
        transmute(c),
        0xff,
        ROUNDING,
    ))
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let a = transmute(a);
    let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
    transmute(_mm_mask_move_ss(a, k, a, r))
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let c = transmute(c);
    let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
    transmute(_mm_move_ss(c, r))
}

/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let a = transmute(a);
    let r = vfcmaddcsh_maskz(a, transmute(b), transmute(c), k, ROUNDING);
    transmute(_mm_move_ss(a, r)) // FIXME: if `k == 0`, LLVM optimizes `vfcmaddcsh_maskz` to an all-zero vector, which is incorrect
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_fma(a, b, c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph())
}
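
// A hedged sketch of how the three masking flavors above differ only in their merge
// source (a hypothetical helper, not part of this module): for a lane whose mask bit is
// clear, the `mask` form keeps `a`, the `mask3` form keeps `c`, and the `maskz` form
// writes zero. Assumes AVX512FP16 and AVX512VL are available.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn fmadd_ph_masking_example(a: __m128h, b: __m128h, c: __m128h) -> [__m128h; 3] {
    let k: __mmask8 = 0b0000_1111; // keep the fused result in the low four lanes only
    [
        _mm_mask_fmadd_ph(a, k, b, c),  // upper four lanes taken from `a`
        _mm_mask3_fmadd_ph(a, b, c, k), // upper four lanes taken from `c`
        _mm_maskz_fmadd_ph(k, a, b, c), // upper four lanes zeroed
    ]
}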

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_fma(a, b, c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_fma(a, b, c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vfmaddph_512(a, b, c, ROUNDING)
}
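
// A minimal usage sketch for the explicit-rounding variant (a hypothetical helper, not
// part of this module): the rounding mode is a const generic, so it is supplied via
// turbofish. `_MM_FROUND_CUR_DIRECTION` defers to `MXCSR.RC`, which is what the
// non-`round` intrinsics do.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn fmadd_round_example(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    // Round toward negative infinity and suppress exceptions for this one operation,
    // without touching the global MXCSR state.
    _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a, b, c)
}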

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(
        k,
        _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
        _mm512_setzero_ph(),
    )
}
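
// Illustrative sketch, not part of the original source: contrasts the
// writemask and zeromask forms above. Lanes whose mask bit is clear keep the
// value from `a` in the `mask_` form but become 0.0 in the `maskz_` form.
// The helper name is hypothetical; assumes nightly + AVX512-FP16.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_masked_fmadd_round_ph() -> (__m512h, __m512h) {
    const RC: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
    let a = _mm512_set1_ph(1.0);
    let b = _mm512_set1_ph(2.0);
    let c = _mm512_set1_ph(3.0);
    let k: __mmask32 = 0x0000_FFFF; // select only the low 16 lanes
    let merged = _mm512_mask_fmadd_round_ph::<RC>(a, k, b, c); // high lanes keep 1.0
    let zeroed = _mm512_maskz_fmadd_round_ph::<RC>(k, a, b, c); // high lanes become 0.0
    (merged, zeroed)
}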

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = fmaf16(extracta, extractb, extractc);
    simd_insert!(a, 0, r)
}
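
// Illustrative sketch, not part of the original source: only lane 0 is
// computed; lanes 1..=7 pass through from `a`. Uses the crate-internal
// `simd_extract!` helper already imported above; the function name is
// hypothetical.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_fmadd_sh() {
    let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.5);
    let b = _mm_set_sh(2.0);
    let c = _mm_set_sh(1.0);
    let r = _mm_fmadd_sh(a, b, c);
    let lo: f16 = simd_extract!(r, 0); // 0.5 * 2.0 + 1.0 = 2.0
    let hi: f16 = simd_extract!(r, 7); // copied from a: 7.0
    assert_eq!(lo, 2.0);
    assert_eq!(hi, 7.0);
}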

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    let mut fmadd: f16 = simd_extract!(a, 0);
    if k & 1 != 0 {
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmadd = fmaf16(fmadd, extractb, extractc);
    }
    simd_insert!(a, 0, fmadd)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    let mut fmadd: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fmadd = fmaf16(extracta, extractb, fmadd);
    }
    simd_insert!(c, 0, fmadd)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let mut fmadd: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmadd = fmaf16(extracta, extractb, extractc);
    }
    simd_insert!(a, 0, fmadd)
}
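
// Illustrative sketch, not part of the original source: with mask bit 0
// clear, the zeromask form zeroes lane 0 while lanes 1..=7 still come from
// `a`. Hypothetical helper; assumes nightly + AVX512-FP16.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_maskz_fmadd_sh() {
    let a = _mm_set1_ph(1.0);
    let b = _mm_set1_ph(2.0);
    let c = _mm_set1_ph(3.0);
    let r = _mm_maskz_fmadd_sh(0, a, b, c);
    let lane0: f16 = simd_extract!(r, 0); // zeroed: mask bit 0 is clear
    let lane1: f16 = simd_extract!(r, 1); // upper lanes are copied from a
    assert_eq!(lane0, 0.0);
    assert_eq!(lane1, 1.0);
}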

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
    simd_insert!(a, 0, r)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fmadd: f16 = simd_extract!(a, 0);
    if k & 1 != 0 {
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
    }
    simd_insert!(a, 0, fmadd)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fmadd: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
    }
    simd_insert!(c, 0, fmadd)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fmadd: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
    }
    simd_insert!(a, 0, fmadd)
}
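
// Illustrative sketch, not part of the original source: the scalar rounded
// form takes the same rounding constants as the packed one. Hypothetical
// helper; assumes nightly + AVX512-FP16.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_fmadd_round_sh() {
    let a = _mm_set_sh(1.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(0.5);
    // Truncate toward zero if 1.0 * 3.0 + 0.5 were not exactly representable
    // (here it is, so the mode does not change the value).
    let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
    let lo: f16 = simd_extract!(r, 0);
    assert_eq!(lo, 3.5);
}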

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_fma(a, b, simd_neg(c))
}
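
// Illustrative sketch, not part of the original source: fmsub computes
// a * b - c, i.e. an fma with `c` negated, exactly as the body above is
// written. Hypothetical helper; assumes nightly plus AVX512-FP16 and
// AVX512VL.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn example_fmsub_ph() {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    let r = _mm_fmsub_ph(a, b, c); // every lane: 2.0 * 3.0 - 1.0 = 5.0
    let lane0: f16 = simd_extract!(r, 0);
    assert_eq!(lane0, 5.0);
}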

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_fma(a, b, simd_neg(c))
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_fma(a, b, simd_neg(c))
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vfmaddph_512(a, b, simd_neg(c), ROUNDING)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(
        k,
        _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
        _mm512_setzero_ph(),
    )
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = fmaf16(extracta, extractb, -extractc);
    simd_insert!(a, 0, r)
}
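
// Illustrative sketch, not part of the original source: the scalar fmsub
// touches only lane 0 and forwards the rest of `a`. Hypothetical helper;
// assumes nightly + AVX512-FP16.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_fmsub_sh() {
    let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.5);
    let b = _mm_set_sh(4.0);
    let c = _mm_set_sh(1.0);
    let r = _mm_fmsub_sh(a, b, c);
    let lo: f16 = simd_extract!(r, 0); // 1.5 * 4.0 - 1.0 = 5.0
    let hi: f16 = simd_extract!(r, 7); // copied from a: 8.0
    assert_eq!(lo, 5.0);
    assert_eq!(hi, 8.0);
}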

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    let mut fmsub: f16 = simd_extract!(a, 0);
    if k & 1 != 0 {
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmsub = fmaf16(fmsub, extractb, -extractc);
    }
    simd_insert!(a, 0, fmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c
/// to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    let mut fmsub: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fmsub = fmaf16(extracta, extractb, -fmsub);
    }
    simd_insert!(c, 0, fmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to
/// the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let mut fmsub: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmsub = fmaf16(extracta, extractb, -extractc);
    }
    simd_insert!(a, 0, fmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
/// 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
    simd_insert!(a, 0, r)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a
/// to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fmsub: f16 = simd_extract!(a, 0);
    if k & 1 != 0 {
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
    }
    simd_insert!(a, 0, fmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k
/// (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c
/// to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fmsub: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
    }
    simd_insert!(c, 0, fmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k
/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to
/// the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fmsub: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
    }
    simd_insert!(a, 0, fmsub)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_fma(simd_neg(a), b, c)
}
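
// Illustrative sketch, not part of the original source: fnmadd computes
// -(a * b) + c, matching the negated-`a` fma in the body above.
// Hypothetical helper; assumes nightly plus AVX512-FP16 and AVX512VL.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn example_fnmadd_ph() {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(10.0);
    let r = _mm_fnmadd_ph(a, b, c); // every lane: -(2.0 * 3.0) + 10.0 = 4.0
    let lane0: f16 = simd_extract!(r, 0);
    assert_eq!(lane0, 4.0);
}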

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_fma(simd_neg(a), b, c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_fma(simd_neg(a), b, c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vfmaddph_512(simd_neg(a), b, c, ROUNDING)
}
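
// Illustrative sketch, not part of the original source:
// `_MM_FROUND_CUR_DIRECTION` defers to the rounding mode currently held in
// `MXCSR.RC` instead of encoding one statically. Hypothetical helper;
// assumes nightly + AVX512-FP16.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_fnmadd_round_ph() -> __m512h {
    let a = _mm512_set1_ph(1.0);
    let b = _mm512_set1_ph(2.0);
    let c = _mm512_set1_ph(3.0);
    // Every lane computes -(1.0 * 2.0) + 3.0 = 1.0 under the current MXCSR
    // rounding mode.
    _mm512_fnmadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}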

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
/// from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
/// out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(
        k,
        _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
        _mm512_setzero_ph(),
    )
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = fmaf16(-extracta, extractb, extractc);
    simd_insert!(a, 0, r)
}
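
// Illustrative sketch, not part of the original source: scalar fnmadd on
// lane 0, upper lanes forwarded from `a`. Hypothetical helper; assumes
// nightly + AVX512-FP16.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_fnmadd_sh() {
    let a = _mm_set_ph(9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0);
    let b = _mm_set_sh(4.0);
    let c = _mm_set_sh(10.0);
    let r = _mm_fnmadd_sh(a, b, c);
    let lo: f16 = simd_extract!(r, 0); // -(2.0 * 4.0) + 10.0 = 2.0
    let hi: f16 = simd_extract!(r, 7); // copied from a: 9.0
    assert_eq!(lo, 2.0);
    assert_eq!(hi, 9.0);
}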
6210
6211/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6212/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6213/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6214/// elements of dst.
6215///
6216/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
6217#[inline]
6218#[target_feature(enable = "avx512fp16")]
6219#[cfg_attr(test, assert_instr(vfnmadd))]
6220#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6221pub unsafe fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6222    let mut fnmadd: f16 = simd_extract!(a, 0);
6223    if k & 1 != 0 {
6224        let extractb: f16 = simd_extract!(b, 0);
6225        let extractc: f16 = simd_extract!(c, 0);
6226        fnmadd = fmaf16(-fnmadd, extractb, extractc);
6227    }
6228    simd_insert!(a, 0, fnmadd)
6229}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    let mut fnmadd: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fnmadd = fmaf16(-extracta, extractb, fnmadd);
    }
    simd_insert!(c, 0, fnmadd)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let mut fnmadd: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fnmadd = fmaf16(-extracta, extractb, extractc);
    }
    simd_insert!(a, 0, fnmadd)
}
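
// Editor's sketch, not upstream: the zeromask variant zeroes lane 0 when mask
// bit 0 is clear instead of copying it from `a`. Hypothetical helper, same
// nightly/hardware caveats as the sketches above.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn maskz_fnmadd_sh_sketch() {
    let (a, b, c) = (_mm_set_sh(2.0), _mm_set_sh(3.0), _mm_set_sh(10.0));
    let zeroed: f16 = simd_extract!(_mm_maskz_fnmadd_sh(0, a, b, c), 0);
    assert_eq!(zeroed, 0.0); // bit 0 clear: lane 0 zeroed, not copied
}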

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fnmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
    simd_insert!(a, 0, r)
}
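
// Editor's sketch, not upstream: the `_round` form takes the rounding mode as a
// const generic, so it is spelled at the call site. The result here is exact,
// so the mode does not change it; the call mainly illustrates the invocation
// shape. Hypothetical helper, same nightly/hardware caveats as above.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn fnmadd_round_sh_sketch() {
    let (a, b, c) = (_mm_set_sh(2.0), _mm_set_sh(3.0), _mm_set_sh(10.0));
    let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
    let lo: f16 = simd_extract!(r, 0);
    assert_eq!(lo, 4.0); // -(2.0 * 3.0) + 10.0
}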

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fnmadd: f16 = simd_extract!(a, 0);
    if k & 1 != 0 {
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
    }
    simd_insert!(a, 0, fnmadd)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fnmadd: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
    }
    simd_insert!(c, 0, fnmadd)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fnmadd: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
    }
    simd_insert!(a, 0, fnmadd)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_fma(simd_neg(a), b, simd_neg(c))
}
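
// Editor's sketch, not upstream: packed FNMSUB computes -(a * b) - c in every
// lane; all lanes are identical here, so checking lane 0 suffices. The 128-bit
// form needs AVX512-VL in addition to AVX512-FP16. Hypothetical helper, same
// nightly caveats as the earlier sketches.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn fnmsub_ph_sketch() {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    let lane: f16 = simd_extract!(_mm_fnmsub_ph(a, b, c), 0);
    assert_eq!(lane, -7.0); // -(2.0 * 3.0) - 1.0
}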

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_fma(simd_neg(a), b, simd_neg(c))
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
    simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_fma(simd_neg(a), b, simd_neg(c))
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
    simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
}
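
// Editor's sketch, not upstream: the 512-bit `_round` form threads an explicit
// rounding mode down to the instruction. The product here is exact, so any
// legal mode gives the same lanes; the call shows the const-generic spelling.
// Hypothetical helper; assumes the `_mm512_set1_ph` constructor defined
// elsewhere in this module, plus the usual nightly/hardware caveats.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn fnmsub_round_ph_sketch() {
    let a = _mm512_set1_ph(2.0);
    let b = _mm512_set1_ph(3.0);
    let c = _mm512_set1_ph(1.0);
    let r = _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
    let last: f16 = simd_extract!(r, 31);
    assert_eq!(last, -7.0); // -(2.0 * 3.0) - 1.0 in all 32 lanes
}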

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
/// copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
/// zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(
        k,
        _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
        _mm512_setzero_ph(),
    )
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
/// the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = fmaf16(-extracta, extractb, -extractc);
    simd_insert!(a, 0, r)
}
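
// Editor's sketch, not upstream: confirms the scalar FNMSUB contract stated
// above, dst[0] = -(a[0] * b[0]) - c[0], i.e. c is subtracted from the negated
// product, unlike FNMADD where it is added. Hypothetical helper, same
// nightly/hardware caveats as the earlier sketches.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn fnmsub_sh_sketch() {
    let (a, b, c) = (_mm_set_sh(2.0), _mm_set_sh(3.0), _mm_set_sh(1.0));
    let lo: f16 = simd_extract!(_mm_fnmsub_sh(a, b, c), 0);
    assert_eq!(lo, -7.0); // -(2.0 * 3.0) - 1.0
}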

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    let mut fnmsub: f16 = simd_extract!(a, 0);
    if k & 1 != 0 {
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fnmsub = fmaf16(-fnmsub, extractb, -extractc);
    }
    simd_insert!(a, 0, fnmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from c to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    let mut fnmsub: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fnmsub = fmaf16(-extracta, extractb, -fnmsub);
    }
    simd_insert!(c, 0, fnmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    let mut fnmsub: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fnmsub = fmaf16(-extracta, extractb, -extractc);
    }
    simd_insert!(a, 0, fnmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
/// the upper 7 packed elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fnmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let extracta: f16 = simd_extract!(a, 0);
    let extractb: f16 = simd_extract!(b, 0);
    let extractc: f16 = simd_extract!(c, 0);
    let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
    simd_insert!(a, 0, r)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    k: __mmask8,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fnmsub: f16 = simd_extract!(a, 0);
    if k & 1 != 0 {
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
    }
    simd_insert!(a, 0, fnmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from c to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
    a: __m128h,
    b: __m128h,
    c: __m128h,
    k: __mmask8,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fnmsub: f16 = simd_extract!(c, 0);
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
    }
    simd_insert!(c, 0, fnmsub)
}

/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the negated intermediate result. Store the result in the lower element of dst using
/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
    c: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    let mut fnmsub: f16 = 0.0;
    if k & 1 != 0 {
        let extracta: f16 = simd_extract!(a, 0);
        let extractb: f16 = simd_extract!(b, 0);
        let extractc: f16 = simd_extract!(c, 0);
        fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
    }
    simd_insert!(a, 0, fnmsub)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    vfmaddsubph_128(a, b, c)
}
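
// Editor's sketch, not upstream: FMADDSUB subtracts `c` in the even lanes and
// adds it in the odd lanes, i.e. dst[2i] = a*b - c and dst[2i+1] = a*b + c.
// Hypothetical helper, same nightly caveats as the earlier sketches (plus
// AVX512-VL for the 128-bit form).
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn fmaddsub_ph_sketch() {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    let r = _mm_fmaddsub_ph(a, b, c);
    let even: f16 = simd_extract!(r, 0);
    let odd: f16 = simd_extract!(r, 1);
    assert_eq!(even, 5.0); // (2.0 * 3.0) - 1.0
    assert_eq!(odd, 7.0); // (2.0 * 3.0) + 1.0
}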

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
    simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
    vfmaddsubph_256(a, b, c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask3_fmaddsub_ph(
    a: __m256h,
    b: __m256h,
    c: __m256h,
    k: __mmask16,
) -> __m256h {
    simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_fmaddsub_ph(
    k: __mmask16,
    a: __m256h,
    b: __m256h,
    c: __m256h,
) -> __m256h {
    simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
    _mm512_fmaddsub_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmaddsub_ph(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmaddsub_ph(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph())
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vfmaddsubph_512(a, b, c, ROUNDING)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(
        k,
        _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
        _mm512_setzero_ph(),
    )
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c to/from the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
    vfmaddsubph_128(a, b, simd_neg(c))
}
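
// Editor's sketch, not upstream: FMSUBADD mirrors FMADDSUB, adding `c` in the
// even lanes and subtracting it in the odd lanes, which is why the body above
// can reuse vfmaddsubph_128 with a negated `c`. Hypothetical helper, same
// nightly/hardware caveats as the earlier sketches.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn fmsubadd_ph_sketch() {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    let r = _mm_fmsubadd_ph(a, b, c);
    let even: f16 = simd_extract!(r, 0);
    let odd: f16 = simd_extract!(r, 1);
    assert_eq!(even, 7.0); // (2.0 * 3.0) + 1.0
    assert_eq!(odd, 5.0); // (2.0 * 3.0) - 1.0
}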
7189
7190/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7191/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7192/// (the element is copied from a when the corresponding mask bit is not set).
7193///
7194/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
7195#[inline]
7196#[target_feature(enable = "avx512fp16,avx512vl")]
7197#[cfg_attr(test, assert_instr(vfmsubadd))]
7198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7199pub unsafe fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7200    simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a)
7201}
7202
7203/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7204/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7205/// (the element is copied from c when the corresponding mask bit is not set).
7206///
7207/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7208#[inline]
7209#[target_feature(enable = "avx512fp16,avx512vl")]
7210#[cfg_attr(test, assert_instr(vfmsubadd))]
7211#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7212pub unsafe fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7213    simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c)
7214}
7215
7216/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7217/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7218/// (the element is zeroed out when the corresponding mask bit is not set).
7219///
7220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
7221#[inline]
7222#[target_feature(enable = "avx512fp16,avx512vl")]
7223#[cfg_attr(test, assert_instr(vfmsubadd))]
7224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7225pub unsafe fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7226    simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph())
7227}
7228
7229/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7230/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7231///
7232/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
7233#[inline]
7234#[target_feature(enable = "avx512fp16,avx512vl")]
7235#[cfg_attr(test, assert_instr(vfmsubadd))]
7236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7237pub unsafe fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7238    vfmaddsubph_256(a, b, simd_neg(c))
7239}
7240
7241/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7242/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7243/// (the element is copied from a when the corresponding mask bit is not set).
7244///
7245/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
7246#[inline]
7247#[target_feature(enable = "avx512fp16,avx512vl")]
7248#[cfg_attr(test, assert_instr(vfmsubadd))]
7249#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7250pub unsafe fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7251    simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a)
7252}
7253
7254/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7255/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7256/// (the element is copied from c when the corresponding mask bit is not set).
7257///
7258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
7259#[inline]
7260#[target_feature(enable = "avx512fp16,avx512vl")]
7261#[cfg_attr(test, assert_instr(vfmsubadd))]
7262#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7263pub unsafe fn _mm256_mask3_fmsubadd_ph(
7264    a: __m256h,
7265    b: __m256h,
7266    c: __m256h,
7267    k: __mmask16,
7268) -> __m256h {
7269    simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c)
7270}
7271
7272/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7273/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7274/// (the element is zeroed out when the corresponding mask bit is not set).
7275///
7276/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
7277#[inline]
7278#[target_feature(enable = "avx512fp16,avx512vl")]
7279#[cfg_attr(test, assert_instr(vfmsubadd))]
7280#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7281pub unsafe fn _mm256_maskz_fmsubadd_ph(
7282    k: __mmask16,
7283    a: __m256h,
7284    b: __m256h,
7285    c: __m256h,
7286) -> __m256h {
7287    simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph())
7288}
7289
7290/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7291/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7292///
7293/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
7294#[inline]
7295#[target_feature(enable = "avx512fp16")]
7296#[cfg_attr(test, assert_instr(vfmsubadd))]
7297#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7298pub unsafe fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7299    _mm512_fmsubadd_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
7300}
7301
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmsubadd_ph(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmsubadd_ph(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph())
}

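// A quick sketch of the three masking flavors above (illustrative only):
// `mask` falls back to a, `mask3` falls back to c, and `maskz` zeroes
// unselected lanes. For example, with only lane 0 selected:
//
//     let k: __mmask32 = 0b01;
//     let r = _mm512_maskz_fmsubadd_ph(k, a, b, c);
//     // lane 0 holds a[0] * b[0] - c[0]; every other lane is 0.0
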
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c from/to the intermediate result, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
}

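// Sketch of selecting a rounding mode (illustrative): ROUNDING is a const
// generic, so the mode must be known at compile time; `_MM_FROUND_CUR_DIRECTION`
// instead defers to the current `MXCSR.RC` setting.
//
//     let r = _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b, c);
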
/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
/// (the element is copied from a when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    k: __mmask32,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k
/// (the element is copied from c when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
    a: __m512h,
    b: __m512h,
    c: __m512h,
    k: __mmask32,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
}

/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
/// and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k
/// (the element is zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
    c: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(
        k,
        _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
        _mm512_setzero_ph(),
    )
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_rcp_ph(a: __m128h) -> __m128h {
    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src
/// when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    vrcpph_128(a, src, k)
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
}

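// A minimal sketch of the documented error bound (illustrative), assuming
// `avx512fp16` support: `vrcpph` is an approximation, not an exact divide.
//
//     let x = _mm_set1_ph(4.0);
//     let r = _mm_rcp_ph(x);
//     // every lane is ~0.25, with relative error below 1.5 * 2^-12
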
/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_rcp_ph(a: __m256h) -> __m256h {
    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src
/// when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    vrcpph_256(a, src, k)
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_rcp_ph(a: __m512h) -> __m512h {
    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src
/// when the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    vrcpph_512(a, src, k)
}

/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
}

/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
/// upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_rcp_sh(_mm_undefined_ph(), 0xff, a, b)
}

/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst using writemask k (the element is copied from src when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    vrcpsh(a, b, src, k)
}

/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrcpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_rcp_sh(_mm_setzero_ph(), k, a, b)
}

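// Sketch of the scalar/merge behavior (illustrative): only lane 0 is computed
// from b; the remaining lanes come from a.
//
//     let a = _mm_set1_ph(8.0);
//     let b = _mm_set_sh(2.0);
//     let r = _mm_rcp_sh(a, b);
//     // lane 0 is ~0.5 (approximately 1.0 / b[0]); lanes 1..=7 are 8.0
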
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    vrsqrtph_128(a, src, k)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
}

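// Illustrative sketch, with the same error bound as the rcp family:
//
//     let x = _mm_set1_ph(16.0);
//     let r = _mm_rsqrt_ph(x);
//     // every lane is ~0.25 (1 / sqrt(16.0)), within 1.5 * 2^-12 relative error
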
/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    vrsqrtph_256(a, src, k)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using writemask k (elements are copied from src when
/// the corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    vrsqrtph_512(a, src, k)
}

/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
}

/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
/// to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_rsqrt_sh(_mm_undefined_ph(), 0xff, a, b)
}

/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    vrsqrtsh(a, b, src, k)
}

/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// The maximum relative error for this approximation is less than `1.5*2^-12`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vrsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_rsqrt_sh(_mm_setzero_ph(), k, a, b)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_sqrt_ph(a: __m128h) -> __m128h {
    simd_fsqrt(a)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_sqrt_ph(a), src)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph())
}

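// Sketch of write-masked square roots (illustrative): unselected lanes keep src.
//
//     let x = _mm_set1_ph(9.0);
//     let src = _mm_set1_ph(-1.0);
//     let r = _mm_mask_sqrt_ph(src, 0b0000_0101, x);
//     // lanes 0 and 2 hold 3.0; the other six lanes keep -1.0 from src
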
/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
    simd_fsqrt(a)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_sqrt_ph(a), src)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph())
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
    simd_fsqrt(a)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_sqrt_ph(a), src)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph())
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vsqrtph_512(a, ROUNDING)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
}

/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
}

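// Illustrative sketch: the rounding mode applies to the final f16 result.
// `ROUNDING = 8` in the test assertions above corresponds to
// `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`.
//
//     let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(x);
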
/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_sqrt_sh(_mm_undefined_ph(), 0xff, a, b)
}

/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask
/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_sqrt_sh(_mm_setzero_ph(), k, a, b)
}

/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_sqrt_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using writemask k (the element is copied from src when mask
/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vsqrtsh(a, b, src, k, ROUNDING)
}

/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_sqrt_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

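// Sketch of the scalar/merge behavior (illustrative):
//
//     let a = _mm_set1_ph(1.0);
//     let b = _mm_set_sh(25.0);
//     let r = _mm_sqrt_sh(a, b);
//     // lane 0 is 5.0 (the square root of b[0]); lanes 1..=7 are 1.0, copied from a
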
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
    vmaxph_128(a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_max_ph(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph())
}

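// Sketch of the non-IEEE NaN behavior called out above (illustrative, using
// the unstable `f16::NAN` constant): like the underlying x86 max instructions,
// the second operand is returned when either input is NaN, so the operation
// is not symmetric.
//
//     let n = _mm_set1_ph(f16::NAN);
//     let one = _mm_set1_ph(1.0);
//     let r = _mm_max_ph(n, one); // every lane is 1.0 (second operand)
//     let s = _mm_max_ph(one, n); // every lane is NaN (second operand)
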
/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
    vmaxph_256(a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_max_ph(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph())
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
/// value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_max_ph(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph())
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_sae!(SAE);
    vmaxph_512(a, b, SAE)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_max_round_ph<const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_sae!(SAE);
    simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_max_round_ph<const SAE: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_sae!(SAE);
    simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
}

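// Illustrative note: max has no rounding to control (selecting one of two
// already-representable values is exact), so the const parameter here only
// chooses whether floating-point exceptions are suppressed:
//
//     let r = _mm512_max_round_ph::<_MM_FROUND_NO_EXC>(a, b);
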
8255/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8256/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8257/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
8258/// when inputs are NaN or signed-zero values.
8259///
8260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
8261#[inline]
8262#[target_feature(enable = "avx512fp16,avx512vl")]
8263#[cfg_attr(test, assert_instr(vmaxsh))]
8264#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8265pub unsafe fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
8266    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
8267}
8268
8269/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8270/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8271/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8272/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8273///
8274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
8275#[inline]
8276#[target_feature(enable = "avx512fp16,avx512vl")]
8277#[cfg_attr(test, assert_instr(vmaxsh))]
8278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8279pub unsafe fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8280    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8281}
8282
8283/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8284/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8285/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8286/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8287///
8288/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
8289#[inline]
8290#[target_feature(enable = "avx512fp16,avx512vl")]
8291#[cfg_attr(test, assert_instr(vmaxsh))]
8292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8293pub unsafe fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8294    _mm_mask_max_sh(_mm_setzero_ph(), k, a, b)
8295}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_max_round_sh<const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_sae!(SAE);
    vmaxsh(a, b, src, k, SAE)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_max_round_sh<const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_sae!(SAE);
    _mm_mask_max_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
    vminph_128(a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
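///
/// An illustrative sketch (not from Intel's documentation), assuming runtime
/// AVX512-FP16/AVX512-VL support and the unstable `f16` type:
///
/// ```ignore
/// let a = _mm_set1_ph(4.0);
/// let b = _mm_set1_ph(3.0);
/// // Lane 0's mask bit is set, so it receives min(4.0, 3.0) == 3.0;
/// // lanes with a clear mask bit are copied from `src` (here `a`).
/// let r = _mm_mask_min_ph(a, 0b0000_0001, a, b);
/// assert_eq!(_mm_cvtsh_h(r), 3.0);
/// ```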
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_min_ph(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph())
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
    vminph_256(a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_min_ph(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
    simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph())
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
/// when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_min_ph(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
/// NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph())
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
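///
/// An illustrative sketch (not from Intel's documentation) of supplying the SAE
/// constant as a const generic, assuming runtime AVX512-FP16 support and the
/// unstable `f16` type:
///
/// ```ignore
/// let a = _mm512_set1_ph(1.0);
/// let b = _mm512_set1_ph(2.0);
/// // _MM_FROUND_NO_EXC suppresses floating-point exception reporting.
/// let r = _mm512_min_round_ph::<_MM_FROUND_NO_EXC>(a, b);
/// assert_eq!(_mm512_cvtsh_h(r), 1.0);
/// ```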
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
    static_assert_sae!(SAE);
    vminph_512(a, b, SAE)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_min_round_ph<const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_sae!(SAE);
    simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_min_round_ph<const SAE: i32>(
    k: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __m512h {
    static_assert_sae!(SAE);
    simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
/// inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
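///
/// An illustrative sketch (not from Intel's documentation), assuming runtime
/// AVX512-FP16 support and the unstable `f16` type:
///
/// ```ignore
/// let a = _mm_set_sh(3.0);
/// let b = _mm_set_sh(4.0);
/// // Mask bit 0 is clear, so the lower lane is zeroed out instead of
/// // receiving min(3.0, 4.0).
/// let r = _mm_maskz_min_sh(0, a, b);
/// assert_eq!(_mm_cvtsh_h(r), 0.0);
/// ```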
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_min_sh(_mm_setzero_ph(), k, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_min_round_sh<const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_sae!(SAE);
    vminsh(a, b, src, k, SAE)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_min_round_sh<const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_sae!(SAE);
    _mm_mask_min_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
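///
/// An illustrative sketch (not from Intel's documentation), assuming runtime
/// AVX512-FP16/AVX512-VL support and the unstable `f16` type:
///
/// ```ignore
/// let a = _mm_set1_ph(8.0);
/// // floor(log2(8.0)) == 3.0, so every lane of the result is 3.0.
/// let r = _mm_getexp_ph(a);
/// assert_eq!(_mm_cvtsh_h(r), 3.0);
/// ```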
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_getexp_ph(a: __m128h) -> __m128h {
    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
    vgetexpph_128(a, src, k)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_getexp_ph(a: __m256h) -> __m256h {
    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
    vgetexpph_256(a, src, k)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_getexp_ph(a: __m512h) -> __m512h {
    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
/// by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
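///
/// An illustrative sketch (not from Intel's documentation), assuming runtime
/// AVX512-FP16 support and the unstable `f16` type:
///
/// ```ignore
/// let a = _mm512_set1_ph(0.25);
/// // floor(log2(0.25)) == -2.0; _MM_FROUND_NO_EXC suppresses exception reporting.
/// let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
/// assert_eq!(_mm512_cvtsh_h(r), -2.0);
/// ```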
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
    static_assert_sae!(SAE);
    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_getexp_round_ph<const SAE: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_sae!(SAE);
    vgetexpph_512(a, src, k, SAE)
}

/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
    static_assert_sae!(SAE);
    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
}

/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
/// calculates `floor(log2(x))` for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
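///
/// An illustrative sketch (not from Intel's documentation), assuming runtime
/// AVX512-FP16 support and the unstable `f16` type:
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set_sh(32.0);
/// // Lower lane: floor(log2(32.0)) == 5.0; the upper lanes are copied from `a`.
/// let r = _mm_getexp_sh(a, b);
/// assert_eq!(_mm_cvtsh_h(r), 5.0);
/// ```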
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_getexp_sh(_mm_undefined_ph(), 0xff, a, b)
}

/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
/// for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
}

/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
/// lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_getexp_sh(_mm_setzero_ph(), k, a, b)
}

/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
    static_assert_sae!(SAE);
    _mm_mask_getexp_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
}

/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_getexp_round_sh<const SAE: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_sae!(SAE);
    vgetexpsh(a, b, src, k, SAE)
}

/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_getexp_round_sh<const SAE: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_sae!(SAE);
    _mm_mask_getexp_round_sh::<SAE>(_mm_setzero_ph(), k, a, b)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
/// on the interval range defined by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
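///
/// An illustrative sketch (not from Intel's documentation) showing the NORM and
/// SIGN const generics, assuming runtime AVX512-FP16/AVX512-VL support and the
/// unstable `f16` type:
///
/// ```ignore
/// let a = _mm_set1_ph(12.0);
/// // 12.0 == 1.5 * 2^3, so normalizing into [1, 2) yields 1.5 in every lane.
/// let r = _mm_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src>(a);
/// assert_eq!(_mm_cvtsh_h(r), 1.5);
/// ```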
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
/// on the interval range defined by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m256h,
) -> __m256h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m256h,
    k: __mmask16,
    a: __m256h,
) -> __m256h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask16,
    a: __m256h,
) -> __m256h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
/// on the interval range defined by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_getmant_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
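///
/// An illustrative sketch (not from Intel's documentation) with all three
/// constants supplied as const generics, assuming runtime AVX512-FP16 support
/// and the unstable `f16` type:
///
/// ```ignore
/// let a = _mm512_set1_ph(12.0);
/// // 12.0 == 0.75 * 2^4, so normalizing into [0.75, 1.5) yields 0.75.
/// let r = _mm512_getmant_round_ph::<_MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_src, _MM_FROUND_NO_EXC>(a);
/// assert_eq!(_mm512_cvtsh_h(r), 0.75);
/// ```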
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_getmant_round_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4, 5)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_getmant_round_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m512h,
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
}

/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
/// in the sae parameter.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_getmant_round_ph<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask32,
    a: __m512h,
) -> __m512h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    static_assert_sae!(SAE);
    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
}

/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
/// on the interval range defined by norm and the sign depends on sign and the source sign.
///
/// The mantissa is normalized to the interval specified by interv, which can take the following values:
///
///     _MM_MANT_NORM_1_2     // interval [1, 2)
///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by sc which can take the following values:
///
///     _MM_MANT_SIGN_src     // sign = sign(src)
///     _MM_MANT_SIGN_zero    // sign = 0
///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
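///
/// An illustrative sketch (not from Intel's documentation), assuming runtime
/// AVX512-FP16 support and the unstable `f16` type:
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set_sh(-6.0);
/// // |-6.0| == 1.5 * 2^2; with _MM_MANT_SIGN_zero the sign bit is cleared,
/// // so the lower lane is 1.5. The upper lanes are copied from `a`.
/// let r = _mm_getmant_sh::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero>(a, b);
/// assert_eq!(_mm_cvtsh_h(r), 1.5);
/// ```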
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_getmant_sh<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m128h,
    b: __m128h,
) -> __m128h {
    static_assert_uimm_bits!(NORM, 4);
    static_assert_uimm_bits!(SIGN, 2);
    _mm_mask_getmant_sh::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a, b)
}
9393
9394/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9395/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9396/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9397/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9398/// the source sign.
9399///
9400/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9401///
9402///     _MM_MANT_NORM_1_2     // interval [1, 2)
9403///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9404///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9405///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9406///
9407/// The sign is determined by sign, which can take the following values:
9408///
9409///     _MM_MANT_SIGN_src     // sign = sign(src)
9410///     _MM_MANT_SIGN_zero    // sign = 0
9411///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9412///
9413/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9414#[inline]
9415#[target_feature(enable = "avx512fp16")]
9416#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9417#[rustc_legacy_const_generics(4, 5)]
9418#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9419pub unsafe fn _mm_mask_getmant_sh<
9420    const NORM: _MM_MANTISSA_NORM_ENUM,
9421    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9422>(
9423    src: __m128h,
9424    k: __mmask8,
9425    a: __m128h,
9426    b: __m128h,
9427) -> __m128h {
9428    static_assert_uimm_bits!(NORM, 4);
9429    static_assert_uimm_bits!(SIGN, 2);
9430    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9431}
9432
9433/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9434/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9435/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9436/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9437/// the source sign.
9438///
9439/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9440///
9441///     _MM_MANT_NORM_1_2     // interval [1, 2)
9442///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9443///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9444///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9445///
9446/// The sign is determined by sign, which can take the following values:
9447///
9448///     _MM_MANT_SIGN_src     // sign = sign(src)
9449///     _MM_MANT_SIGN_zero    // sign = 0
9450///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9451///
9452/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9453#[inline]
9454#[target_feature(enable = "avx512fp16")]
9455#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9456#[rustc_legacy_const_generics(3, 4)]
9457#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9458pub unsafe fn _mm_maskz_getmant_sh<
9459    const NORM: _MM_MANTISSA_NORM_ENUM,
9460    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9461>(
9462    k: __mmask8,
9463    a: __m128h,
9464    b: __m128h,
9465) -> __m128h {
9466    static_assert_uimm_bits!(NORM, 4);
9467    static_assert_uimm_bits!(SIGN, 2);
9468    _mm_mask_getmant_sh::<NORM, SIGN>(_mm_setzero_ph(), k, a, b)
9469}
9470
9471/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9472/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9473/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9474/// on the interval range defined by norm and the sign depends on sign and the source sign.
9476///
9477/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9478///
9479///     _MM_MANT_NORM_1_2     // interval [1, 2)
9480///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9481///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9482///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9483///
9484/// The sign is determined by sign, which can take the following values:
9485///
9486///     _MM_MANT_SIGN_src     // sign = sign(src)
9487///     _MM_MANT_SIGN_zero    // sign = 0
9488///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9489///
9490/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9491///
9492/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9493#[inline]
9494#[target_feature(enable = "avx512fp16")]
9495#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9496#[rustc_legacy_const_generics(2, 3, 4)]
9497#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9498pub unsafe fn _mm_getmant_round_sh<
9499    const NORM: _MM_MANTISSA_NORM_ENUM,
9500    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9501    const SAE: i32,
9502>(
9503    a: __m128h,
9504    b: __m128h,
9505) -> __m128h {
9506    static_assert_uimm_bits!(NORM, 4);
9507    static_assert_uimm_bits!(SIGN, 2);
9508    static_assert_sae!(SAE);
9509    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_undefined_ph(), 0xff, a, b)
9510}
9511
9512/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9513/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9514/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9515/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9516/// the source sign.
9517///
9518/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9519///
9520///     _MM_MANT_NORM_1_2     // interval [1, 2)
9521///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9522///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9523///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9524///
9525/// The sign is determined by sign, which can take the following values:
9526///
9527///     _MM_MANT_SIGN_src     // sign = sign(src)
9528///     _MM_MANT_SIGN_zero    // sign = 0
9529///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9530///
9531/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9532///
9533/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
9534#[inline]
9535#[target_feature(enable = "avx512fp16")]
9536#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9537#[rustc_legacy_const_generics(4, 5, 6)]
9538#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9539pub unsafe fn _mm_mask_getmant_round_sh<
9540    const NORM: _MM_MANTISSA_NORM_ENUM,
9541    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9542    const SAE: i32,
9543>(
9544    src: __m128h,
9545    k: __mmask8,
9546    a: __m128h,
9547    b: __m128h,
9548) -> __m128h {
9549    static_assert_uimm_bits!(NORM, 4);
9550    static_assert_uimm_bits!(SIGN, 2);
9551    static_assert_sae!(SAE);
9552    vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
9553}
9554
9555/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9556/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9557/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9558/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9559/// the source sign.
9560///
9561/// The mantissa is normalized to the interval specified by norm, which can take the following values:
9562///
9563///     _MM_MANT_NORM_1_2     // interval [1, 2)
9564///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9565///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9566///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9567///
9568/// The sign is determined by sign, which can take the following values:
9569///
9570///     _MM_MANT_SIGN_src     // sign = sign(src)
9571///     _MM_MANT_SIGN_zero    // sign = 0
9572///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9573///
9574/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9575///
9576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
9577#[inline]
9578#[target_feature(enable = "avx512fp16")]
9579#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9580#[rustc_legacy_const_generics(3, 4, 5)]
9581#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9582pub unsafe fn _mm_maskz_getmant_round_sh<
9583    const NORM: _MM_MANTISSA_NORM_ENUM,
9584    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9585    const SAE: i32,
9586>(
9587    k: __mmask8,
9588    a: __m128h,
9589    b: __m128h,
9590) -> __m128h {
9591    static_assert_uimm_bits!(NORM, 4);
9592    static_assert_uimm_bits!(SIGN, 2);
9593    static_assert_sae!(SAE);
9594    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(_mm_setzero_ph(), k, a, b)
9595}
9596
9597/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9598/// specified by imm8, and store the results in dst.
9599///
9600/// Rounding is done according to the imm8 parameter, which can be one of:
9601///
9602/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9603/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9604/// * [`_MM_FROUND_TO_POS_INF`] : round up
9605/// * [`_MM_FROUND_TO_ZERO`] : truncate
9606/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9607///
9608/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
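///
/// Conceptually, roundscale keeps `IMM8[7:4]` fraction bits, i.e. it computes
/// `2^-M * round(a * 2^M)`. A small illustrative sketch (not run as a doctest;
/// assumes AVX512-FP16 and the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let a = _mm_set1_ph(2.75);
///     // IMM8 = 0: keep no fraction bits, round to the nearest integer.
///     let r0 = _mm_roundscale_ph::<0>(a); // 3.0 in every lane
///     // Keep one fraction bit and truncate: 2.75 becomes 2.5.
///     let r1 = _mm_roundscale_ph::<{ (1 << 4) | _MM_FROUND_TO_ZERO }>(a);
/// }
/// ```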
9609#[inline]
9610#[target_feature(enable = "avx512fp16,avx512vl")]
9611#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9612#[rustc_legacy_const_generics(1)]
9613#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9614pub unsafe fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
9615    static_assert_uimm_bits!(IMM8, 8);
9616    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
9617}
9618
9619/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9620/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9621/// the corresponding mask bit is not set).
9622///
9623/// Rounding is done according to the imm8 parameter, which can be one of:
9624///
9625/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9626/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9627/// * [`_MM_FROUND_TO_POS_INF`] : round up
9628/// * [`_MM_FROUND_TO_ZERO`] : truncate
9629/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9630///
9631/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
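///
/// A brief sketch of the merge-masking behaviour (illustrative only; assumes
/// AVX512-FP16 and the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let src = _mm_set1_ph(-1.0);
///     let a = _mm_set1_ph(2.75);
///     // Only lane 0 has its mask bit set, so only it is rounded; the other
///     // seven lanes are copied from `src`.
///     let r = _mm_mask_roundscale_ph::<0>(src, 0b0000_0001, a);
///     // r == [3.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]
/// }
/// ```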
9632#[inline]
9633#[target_feature(enable = "avx512fp16,avx512vl")]
9634#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9635#[rustc_legacy_const_generics(3)]
9636#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9637pub unsafe fn _mm_mask_roundscale_ph<const IMM8: i32>(
9638    src: __m128h,
9639    k: __mmask8,
9640    a: __m128h,
9641) -> __m128h {
9642    static_assert_uimm_bits!(IMM8, 8);
9643    vrndscaleph_128(a, IMM8, src, k)
9644}
9645
9646/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9647/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9648/// mask bit is not set).
9649///
9650/// Rounding is done according to the imm8 parameter, which can be one of:
9651///
9652/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9653/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9654/// * [`_MM_FROUND_TO_POS_INF`] : round up
9655/// * [`_MM_FROUND_TO_ZERO`] : truncate
9656/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9657///
9658/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
9659#[inline]
9660#[target_feature(enable = "avx512fp16,avx512vl")]
9661#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9662#[rustc_legacy_const_generics(2)]
9663#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9664pub unsafe fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
9665    static_assert_uimm_bits!(IMM8, 8);
9666    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
9667}
9668
9669/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9670/// specified by imm8, and store the results in dst.
9671///
9672/// Rounding is done according to the imm8 parameter, which can be one of:
9673///
9674/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9675/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9676/// * [`_MM_FROUND_TO_POS_INF`] : round up
9677/// * [`_MM_FROUND_TO_ZERO`] : truncate
9678/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9679///
9680/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
9681#[inline]
9682#[target_feature(enable = "avx512fp16,avx512vl")]
9683#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9684#[rustc_legacy_const_generics(1)]
9685#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9686pub unsafe fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
9687    static_assert_uimm_bits!(IMM8, 8);
9688    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
9689}
9690
9691/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9692/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9693/// the corresponding mask bit is not set).
9694///
9695/// Rounding is done according to the imm8 parameter, which can be one of:
9696///
9697/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9698/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9699/// * [`_MM_FROUND_TO_POS_INF`] : round up
9700/// * [`_MM_FROUND_TO_ZERO`] : truncate
9701/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9702///
9703/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
9704#[inline]
9705#[target_feature(enable = "avx512fp16,avx512vl")]
9706#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9707#[rustc_legacy_const_generics(3)]
9708#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9709pub unsafe fn _mm256_mask_roundscale_ph<const IMM8: i32>(
9710    src: __m256h,
9711    k: __mmask16,
9712    a: __m256h,
9713) -> __m256h {
9714    static_assert_uimm_bits!(IMM8, 8);
9715    vrndscaleph_256(a, IMM8, src, k)
9716}
9717
9718/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9719/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9720/// mask bit is not set).
9721///
9722/// Rounding is done according to the imm8 parameter, which can be one of:
9723///
9724/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9725/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9726/// * [`_MM_FROUND_TO_POS_INF`] : round up
9727/// * [`_MM_FROUND_TO_ZERO`] : truncate
9728/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9729///
9730/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
9731#[inline]
9732#[target_feature(enable = "avx512fp16,avx512vl")]
9733#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9734#[rustc_legacy_const_generics(2)]
9735#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9736pub unsafe fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
9737    static_assert_uimm_bits!(IMM8, 8);
9738    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
9739}
9740
9741/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9742/// specified by imm8, and store the results in dst.
9743///
9744/// Rounding is done according to the imm8 parameter, which can be one of:
9745///
9746/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9747/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9748/// * [`_MM_FROUND_TO_POS_INF`] : round up
9749/// * [`_MM_FROUND_TO_ZERO`] : truncate
9750/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9751///
9752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
9753#[inline]
9754#[target_feature(enable = "avx512fp16")]
9755#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9756#[rustc_legacy_const_generics(1)]
9757#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9758pub unsafe fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
9759    static_assert_uimm_bits!(IMM8, 8);
9760    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
9761}
9762
9763/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9764/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9765/// the corresponding mask bit is not set).
9766///
9767/// Rounding is done according to the imm8 parameter, which can be one of:
9768///
9769/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9770/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9771/// * [`_MM_FROUND_TO_POS_INF`] : round up
9772/// * [`_MM_FROUND_TO_ZERO`] : truncate
9773/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9774///
9775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
9776#[inline]
9777#[target_feature(enable = "avx512fp16")]
9778#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9779#[rustc_legacy_const_generics(3)]
9780#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9781pub unsafe fn _mm512_mask_roundscale_ph<const IMM8: i32>(
9782    src: __m512h,
9783    k: __mmask32,
9784    a: __m512h,
9785) -> __m512h {
9786    static_assert_uimm_bits!(IMM8, 8);
9787    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9788}
9789
9790/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9791/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9792/// mask bit is not set).
9793///
9794/// Rounding is done according to the imm8 parameter, which can be one of:
9795///
9796/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9797/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9798/// * [`_MM_FROUND_TO_POS_INF`] : round up
9799/// * [`_MM_FROUND_TO_ZERO`] : truncate
9800/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9801///
9802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
9803#[inline]
9804#[target_feature(enable = "avx512fp16")]
9805#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9806#[rustc_legacy_const_generics(2)]
9807#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9808pub unsafe fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
9809    static_assert_uimm_bits!(IMM8, 8);
9810    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
9811}
9812
9813/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9814/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9815/// in the sae parameter
9816///
9817/// Rounding is done according to the imm8 parameter, which can be one of:
9818///
9819/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9820/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9821/// * [`_MM_FROUND_TO_POS_INF`] : round up
9822/// * [`_MM_FROUND_TO_ZERO`] : truncate
9823/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9824///
9825/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
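///
/// A minimal sketch (illustrative only; assumes AVX512-FP16 and the unstable
/// `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ph(2.75);
///     // Round to the nearest integer while suppressing exceptions.
///     let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a); // 3.0 everywhere
/// }
/// ```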
9826#[inline]
9827#[target_feature(enable = "avx512fp16")]
9828#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
9829#[rustc_legacy_const_generics(1, 2)]
9830#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9831pub unsafe fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
9832    static_assert_uimm_bits!(IMM8, 8);
9833    static_assert_sae!(SAE);
9834    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9835}
9836
9837/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9838/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9839/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9840/// in the sae parameter
9841///
9842/// Rounding is done according to the imm8 parameter, which can be one of:
9843///
9844/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9845/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9846/// * [`_MM_FROUND_TO_POS_INF`] : round up
9847/// * [`_MM_FROUND_TO_ZERO`] : truncate
9848/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9849///
9850/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
9851#[inline]
9852#[target_feature(enable = "avx512fp16")]
9853#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
9854#[rustc_legacy_const_generics(3, 4)]
9855#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9856pub unsafe fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
9857    src: __m512h,
9858    k: __mmask32,
9859    a: __m512h,
9860) -> __m512h {
9861    static_assert_uimm_bits!(IMM8, 8);
9862    static_assert_sae!(SAE);
9863    vrndscaleph_512(a, IMM8, src, k, SAE)
9864}
9865
9866/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9867/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9868/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9869///
9870/// Rounding is done according to the imm8 parameter, which can be one of:
9871///
9872/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9873/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9874/// * [`_MM_FROUND_TO_POS_INF`] : round up
9875/// * [`_MM_FROUND_TO_ZERO`] : truncate
9876/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9877///
9878/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
9879#[inline]
9880#[target_feature(enable = "avx512fp16")]
9881#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
9882#[rustc_legacy_const_generics(2, 3)]
9883#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9884pub unsafe fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
9885    k: __mmask32,
9886    a: __m512h,
9887) -> __m512h {
9888    static_assert_uimm_bits!(IMM8, 8);
9889    static_assert_sae!(SAE);
9890    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
9891}
9892
9893/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
9894/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
9895/// from a to the upper elements of dst.
9896///
9897/// Rounding is done according to the imm8 parameter, which can be one of:
9898///
9899/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9900/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9901/// * [`_MM_FROUND_TO_POS_INF`] : round up
9902/// * [`_MM_FROUND_TO_ZERO`] : truncate
9903/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9904///
9905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
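///
/// A short sketch of the scalar behaviour (illustrative only; assumes
/// AVX512-FP16 and the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let a = _mm_set1_ph(7.0);
///     let b = _mm_set_sh(2.75);
///     // Lower lane: 2.75 rounded to the nearest integer (IMM8 = 0) is 3.0;
///     // the upper 7 lanes are copied from `a`.
///     let r = _mm_roundscale_sh::<0>(a, b);
/// }
/// ```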
9906#[inline]
9907#[target_feature(enable = "avx512fp16")]
9908#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
9909#[rustc_legacy_const_generics(2)]
9910#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9911pub unsafe fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
9912    static_assert_uimm_bits!(IMM8, 8);
9913    _mm_mask_roundscale_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
9914}
9915
9916/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
9917/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
9918/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
9919///
9920/// Rounding is done according to the imm8 parameter, which can be one of:
9921///
9922/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9923/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9924/// * [`_MM_FROUND_TO_POS_INF`] : round up
9925/// * [`_MM_FROUND_TO_ZERO`] : truncate
9926/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9927///
9928/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
9929#[inline]
9930#[target_feature(enable = "avx512fp16")]
9931#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
9932#[rustc_legacy_const_generics(4)]
9933#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9934pub unsafe fn _mm_mask_roundscale_sh<const IMM8: i32>(
9935    src: __m128h,
9936    k: __mmask8,
9937    a: __m128h,
9938    b: __m128h,
9939) -> __m128h {
9940    static_assert_uimm_bits!(IMM8, 8);
9941    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9942}
9943
9944/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
9945/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
9946/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
9947///
9948/// Rounding is done according to the imm8 parameter, which can be one of:
9949///
9950/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9951/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9952/// * [`_MM_FROUND_TO_POS_INF`] : round up
9953/// * [`_MM_FROUND_TO_ZERO`] : truncate
9954/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9955///
9956/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
9957#[inline]
9958#[target_feature(enable = "avx512fp16")]
9959#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
9960#[rustc_legacy_const_generics(3)]
9961#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9962pub unsafe fn _mm_maskz_roundscale_sh<const IMM8: i32>(
9963    k: __mmask8,
9964    a: __m128h,
9965    b: __m128h,
9966) -> __m128h {
9967    static_assert_uimm_bits!(IMM8, 8);
9968    _mm_mask_roundscale_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
9969}
9970
9971/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
9972/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
9973/// from a to the upper elements of dst.
9974///
9975/// Rounding is done according to the imm8 parameter, which can be one of:
9976///
9977/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9978/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9979/// * [`_MM_FROUND_TO_POS_INF`] : round up
9980/// * [`_MM_FROUND_TO_ZERO`] : truncate
9981/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9982///
9983/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
9984///
9985/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
9986#[inline]
9987#[target_feature(enable = "avx512fp16")]
9988#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
9989#[rustc_legacy_const_generics(2, 3)]
9990#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9991pub unsafe fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
9992    a: __m128h,
9993    b: __m128h,
9994) -> __m128h {
9995    static_assert_uimm_bits!(IMM8, 8);
9996    static_assert_sae!(SAE);
9997    _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
9998}
9999
10000/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10001/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10002/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10003///
10004/// Rounding is done according to the imm8 parameter, which can be one of:
10005///
10006/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10007/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10008/// * [`_MM_FROUND_TO_POS_INF`] : round up
10009/// * [`_MM_FROUND_TO_ZERO`] : truncate
10010/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10011///
10012/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10013///
10014/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
10015#[inline]
10016#[target_feature(enable = "avx512fp16")]
10017#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10018#[rustc_legacy_const_generics(4, 5)]
10019#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10020pub unsafe fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10021    src: __m128h,
10022    k: __mmask8,
10023    a: __m128h,
10024    b: __m128h,
10025) -> __m128h {
10026    static_assert_uimm_bits!(IMM8, 8);
10027    static_assert_sae!(SAE);
10028    vrndscalesh(a, b, src, k, IMM8, SAE)
10029}
10030
10031/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10032/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10033/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10034///
10035/// Rounding is done according to the imm8 parameter, which can be one of:
10036///
10037/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10038/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10039/// * [`_MM_FROUND_TO_POS_INF`] : round up
10040/// * [`_MM_FROUND_TO_ZERO`] : truncate
10041/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10042///
10043/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
10044///
10045/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
10046#[inline]
10047#[target_feature(enable = "avx512fp16")]
10048#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10049#[rustc_legacy_const_generics(3, 4)]
10050#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10051pub unsafe fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10052    k: __mmask8,
10053    a: __m128h,
10054    b: __m128h,
10055) -> __m128h {
10056    static_assert_uimm_bits!(IMM8, 8);
10057    static_assert_sae!(SAE);
10058    _mm_mask_roundscale_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
10059}
10060
10061/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10062/// the results in dst.
10063///
10064/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
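///
/// scalef computes `a * 2^floor(b)` lane-wise. A minimal sketch (illustrative
/// only; assumes AVX512-FP16 and the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let a = _mm_set1_ph(3.0);
///     let b = _mm_set1_ph(2.0);
///     let r = _mm_scalef_ph(a, b); // 3.0 * 2^2 == 12.0 in every lane
/// }
/// ```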
10065#[inline]
10066#[target_feature(enable = "avx512fp16,avx512vl")]
10067#[cfg_attr(test, assert_instr(vscalefph))]
10068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10069pub unsafe fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
10070    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
10071}
10072
10073/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10074/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10075///
10076/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
10077#[inline]
10078#[target_feature(enable = "avx512fp16,avx512vl")]
10079#[cfg_attr(test, assert_instr(vscalefph))]
10080#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10081pub unsafe fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10082    vscalefph_128(a, b, src, k)
10083}
10084
10085/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10086/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10087///
10088/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
10089#[inline]
10090#[target_feature(enable = "avx512fp16,avx512vl")]
10091#[cfg_attr(test, assert_instr(vscalefph))]
10092#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10093pub unsafe fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10094    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
10095}
10096
10097/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10098/// the results in dst.
10099///
10100/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
10101#[inline]
10102#[target_feature(enable = "avx512fp16,avx512vl")]
10103#[cfg_attr(test, assert_instr(vscalefph))]
10104#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10105pub unsafe fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
10106    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
10107}
10108
10109/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10110/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10111///
10112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
10113#[inline]
10114#[target_feature(enable = "avx512fp16,avx512vl")]
10115#[cfg_attr(test, assert_instr(vscalefph))]
10116#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10117pub unsafe fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10118    vscalefph_256(a, b, src, k)
10119}
10120
10121/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10122/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10123///
10124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
10125#[inline]
10126#[target_feature(enable = "avx512fp16,avx512vl")]
10127#[cfg_attr(test, assert_instr(vscalefph))]
10128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10129pub unsafe fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10130    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
10131}
10132
10133/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10134/// the results in dst.
10135///
10136/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
10137#[inline]
10138#[target_feature(enable = "avx512fp16")]
10139#[cfg_attr(test, assert_instr(vscalefph))]
10140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10141pub unsafe fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
10142    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
10143}
10144
10145/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10146/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10147///
10148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
10149#[inline]
10150#[target_feature(enable = "avx512fp16")]
10151#[cfg_attr(test, assert_instr(vscalefph))]
10152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10153pub unsafe fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10154    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10155}
10156
10157/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10158/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10159///
10160/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
10161#[inline]
10162#[target_feature(enable = "avx512fp16")]
10163#[cfg_attr(test, assert_instr(vscalefph))]
10164#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10165pub unsafe fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10166    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
10167}
10168
10169/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10170/// the results in dst.
10171///
10172/// Rounding is done according to the rounding parameter, which can be one of:
10173///
10174/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10175/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10176/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10177/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10178/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10179///
10180/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
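///
/// A minimal sketch (illustrative only; assumes AVX512-FP16 and the unstable
/// `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ph(3.0);
///     let b = _mm512_set1_ph(-1.0);
///     // 3.0 * 2^-1 == 1.5 in every lane, rounding to nearest with
///     // exceptions suppressed.
///     let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
/// }
/// ```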
10181#[inline]
10182#[target_feature(enable = "avx512fp16")]
10183#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10184#[rustc_legacy_const_generics(2)]
10185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10186pub unsafe fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
10187    static_assert_rounding!(ROUNDING);
10188    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
10189}
10190
10191/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10192/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10193///
10194/// Rounding is done according to the rounding parameter, which can be one of:
10195///
10196/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10197/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10198/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10199/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10200/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10201///
10202/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
10203#[inline]
10204#[target_feature(enable = "avx512fp16")]
10205#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10206#[rustc_legacy_const_generics(4)]
10207#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10208pub unsafe fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
10209    src: __m512h,
10210    k: __mmask32,
10211    a: __m512h,
10212    b: __m512h,
10213) -> __m512h {
10214    static_assert_rounding!(ROUNDING);
10215    vscalefph_512(a, b, src, k, ROUNDING)
10216}
10217
10218/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10219/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10220///
10221/// Rounding is done according to the rounding parameter, which can be one of:
10222///
10223/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10224/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10225/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10226/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10227/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10228///
10229/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
10230#[inline]
10231#[target_feature(enable = "avx512fp16")]
10232#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10233#[rustc_legacy_const_generics(3)]
10234#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10235pub unsafe fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
10236    k: __mmask32,
10237    a: __m512h,
10238    b: __m512h,
10239) -> __m512h {
10240    static_assert_rounding!(ROUNDING);
10241    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
10242}
10243
10244/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
10245/// element of b, store the result in the lower element of dst, and copy the upper 7 packed elements
10246/// from a to the upper elements of dst.
10247///
10248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
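///
/// A short sketch of the scalar behaviour (illustrative only; assumes
/// AVX512-FP16 and the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let a = _mm_set1_ph(3.0);
///     let b = _mm_set_sh(4.0);
///     // Lower lane: 3.0 * 2^4 == 48.0; the upper 7 lanes are copied from `a`.
///     let r = _mm_scalef_sh(a, b);
/// }
/// ```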
10249#[inline]
10250#[target_feature(enable = "avx512fp16")]
10251#[cfg_attr(test, assert_instr(vscalefsh))]
10252#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10253pub unsafe fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
10254    _mm_mask_scalef_sh(_mm_undefined_ph(), 0xff, a, b)
10255}
10256
10257/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
10258/// element of b, store the result in the lower element of dst using writemask k (the element is copied
10259/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10260///
10261/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
10262#[inline]
10263#[target_feature(enable = "avx512fp16")]
10264#[cfg_attr(test, assert_instr(vscalefsh))]
10265#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10266pub unsafe fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10267    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10268}
10269
10270/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
10271/// element of b, store the result in the lower element of dst using zeromask k (the element is zeroed
10272/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10273///
10274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
10275#[inline]
10276#[target_feature(enable = "avx512fp16")]
10277#[cfg_attr(test, assert_instr(vscalefsh))]
10278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10279pub unsafe fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10280    _mm_mask_scalef_sh(_mm_setzero_ph(), k, a, b)
10281}
10282
10283/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
10284/// element of b, store the result in the lower element of dst, and copy the upper 7 packed elements
10285/// from a to the upper elements of dst.
10286///
10287/// Rounding is done according to the rounding parameter, which can be one of:
10288///
10289/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10290/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10291/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10292/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10293/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10294///
10295/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
10296#[inline]
10297#[target_feature(enable = "avx512fp16")]
10298#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10299#[rustc_legacy_const_generics(2)]
10300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10301pub unsafe fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
10302    static_assert_rounding!(ROUNDING);
10303    _mm_mask_scalef_round_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
10304}
10305
10306/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
10307/// element of b, store the result in the lower element of dst using writemask k (the element is copied
10308/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10309///
10310/// Rounding is done according to the rounding parameter, which can be one of:
10311///
10312/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10313/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10314/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10315/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10316/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10317///
10318/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
10319#[inline]
10320#[target_feature(enable = "avx512fp16")]
10321#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10322#[rustc_legacy_const_generics(4)]
10323#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10324pub unsafe fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
10325    src: __m128h,
10326    k: __mmask8,
10327    a: __m128h,
10328    b: __m128h,
10329) -> __m128h {
10330    static_assert_rounding!(ROUNDING);
10331    vscalefsh(a, b, src, k, ROUNDING)
10332}
10333
10334/// Scale the lower half-precision (16-bit) floating-point element in a using the value in the lower
10335/// element of b, store the result in the lower element of dst using zeromask k (the element is zeroed
10336/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10337///
10338/// Rounding is done according to the rounding parameter, which can be one of:
10339///
10340/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10341/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10342/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10343/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10344/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10345///
10346/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
10347#[inline]
10348#[target_feature(enable = "avx512fp16")]
10349#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10350#[rustc_legacy_const_generics(3)]
10351#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10352pub unsafe fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
10353    k: __mmask8,
10354    a: __m128h,
10355    b: __m128h,
10356) -> __m128h {
10357    static_assert_rounding!(ROUNDING);
10358    _mm_mask_scalef_round_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
10359}
10360
10361/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10362/// number of bits specified by imm8, and store the results in dst.
10363///
10364/// Rounding is done according to the imm8 parameter, which can be one of:
10365///
10366/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10367/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10368/// * [`_MM_FROUND_TO_POS_INF`] : round up
10369/// * [`_MM_FROUND_TO_ZERO`] : truncate
10370/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10371///
10372/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
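///
/// The reduced argument is `a - roundscale(a, IMM8)`. A minimal sketch
/// (illustrative only; assumes AVX512-FP16 and the unstable
/// `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let a = _mm_set1_ph(2.75);
///     // With IMM8 = 0 (round to the nearest integer): 2.75 - 3.0 == -0.25.
///     let r = _mm_reduce_ph::<0>(a); // -0.25 in every lane
/// }
/// ```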
10373#[inline]
10374#[target_feature(enable = "avx512fp16,avx512vl")]
10375#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10376#[rustc_legacy_const_generics(1)]
10377#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10378pub unsafe fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
10379    static_assert_uimm_bits!(IMM8, 8);
10380    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
10381}
10382
10383/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10384/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10385/// from src when the corresponding mask bit is not set).
10386///
10387/// Rounding is done according to the imm8 parameter, which can be one of:
10388///
10389/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10390/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10391/// * [`_MM_FROUND_TO_POS_INF`] : round up
10392/// * [`_MM_FROUND_TO_ZERO`] : truncate
10393/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10394///
10395/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
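///
/// A brief sketch of the merge-masking behaviour (illustrative only; assumes
/// AVX512-FP16 and the unstable `stdarch_x86_avx512_f16` feature):
///
/// ```ignore
/// unsafe {
///     let src = _mm_set1_ph(9.0);
///     let a = _mm_set1_ph(2.75);
///     // Lanes 0..4 are reduced (2.75 - 3.0 == -0.25); lanes 4..8 keep `src`.
///     let r = _mm_mask_reduce_ph::<0>(src, 0b0000_1111, a);
/// }
/// ```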
10396#[inline]
10397#[target_feature(enable = "avx512fp16,avx512vl")]
10398#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10399#[rustc_legacy_const_generics(3)]
10400#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10401pub unsafe fn _mm_mask_reduce_ph<const IMM8: i32>(
10402    src: __m128h,
10403    k: __mmask8,
10404    a: __m128h,
10405) -> __m128h {
10406    static_assert_uimm_bits!(IMM8, 8);
10407    vreduceph_128(a, IMM8, src, k)
10408}
10409
10410/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10411/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10412/// out when the corresponding mask bit is not set).
10413///
10414/// Rounding is done according to the imm8 parameter, which can be one of:
10415///
10416/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10417/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10418/// * [`_MM_FROUND_TO_POS_INF`] : round up
10419/// * [`_MM_FROUND_TO_ZERO`] : truncate
10420/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10421///
10422/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
10423#[inline]
10424#[target_feature(enable = "avx512fp16,avx512vl")]
10425#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10426#[rustc_legacy_const_generics(2)]
10427#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10428pub unsafe fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
10429    static_assert_uimm_bits!(IMM8, 8);
10430    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
10431}
10432
10433/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10434/// number of bits specified by imm8, and store the results in dst.
10435///
10436/// Rounding is done according to the imm8 parameter, which can be one of:
10437///
10438/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10439/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10440/// * [`_MM_FROUND_TO_POS_INF`] : round up
10441/// * [`_MM_FROUND_TO_ZERO`] : truncate
10442/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10443///
10444/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
10445#[inline]
10446#[target_feature(enable = "avx512fp16,avx512vl")]
10447#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10448#[rustc_legacy_const_generics(1)]
10449#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10450pub unsafe fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
10451    static_assert_uimm_bits!(IMM8, 8);
10452    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
10453}
10454
10455/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10456/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10457/// from src when the corresponding mask bit is not set).
10458///
10459/// Rounding is done according to the imm8 parameter, which can be one of:
10460///
10461/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10462/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10463/// * [`_MM_FROUND_TO_POS_INF`] : round up
10464/// * [`_MM_FROUND_TO_ZERO`] : truncate
10465/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10466///
10467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
10468#[inline]
10469#[target_feature(enable = "avx512fp16,avx512vl")]
10470#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10471#[rustc_legacy_const_generics(3)]
10472#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10473pub unsafe fn _mm256_mask_reduce_ph<const IMM8: i32>(
10474    src: __m256h,
10475    k: __mmask16,
10476    a: __m256h,
10477) -> __m256h {
10478    static_assert_uimm_bits!(IMM8, 8);
10479    vreduceph_256(a, IMM8, src, k)
10480}
10481
10482/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10483/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10484/// out when the corresponding mask bit is not set).
10485///
10486/// Rounding is done according to the imm8 parameter, which can be one of:
10487///
10488/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10489/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10490/// * [`_MM_FROUND_TO_POS_INF`] : round up
10491/// * [`_MM_FROUND_TO_ZERO`] : truncate
10492/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10493///
10494/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
10495#[inline]
10496#[target_feature(enable = "avx512fp16,avx512vl")]
10497#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10498#[rustc_legacy_const_generics(2)]
10499#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10500pub unsafe fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10501    static_assert_uimm_bits!(IMM8, 8);
10502    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10503}
10504
10505/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10506/// number of bits specified by imm8, and store the results in dst.
10507///
10508/// Rounding is done according to the imm8 parameter, which can be one of:
10509///
10510/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10511/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10512/// * [`_MM_FROUND_TO_POS_INF`] : round up
10513/// * [`_MM_FROUND_TO_ZERO`] : truncate
10514/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10515///
10516/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
10517#[inline]
10518#[target_feature(enable = "avx512fp16")]
10519#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10520#[rustc_legacy_const_generics(1)]
10521#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10522pub unsafe fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
10523    static_assert_uimm_bits!(IMM8, 8);
10524    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
10525}
10526
10527/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10528/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10529/// from src when the corresponding mask bit is not set).
10530///
10531/// Rounding is done according to the imm8 parameter, which can be one of:
10532///
10533/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10534/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10535/// * [`_MM_FROUND_TO_POS_INF`] : round up
10536/// * [`_MM_FROUND_TO_ZERO`] : truncate
10537/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10538///
10539/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
10540#[inline]
10541#[target_feature(enable = "avx512fp16")]
10542#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10543#[rustc_legacy_const_generics(3)]
10544#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10545pub unsafe fn _mm512_mask_reduce_ph<const IMM8: i32>(
10546    src: __m512h,
10547    k: __mmask32,
10548    a: __m512h,
10549) -> __m512h {
10550    static_assert_uimm_bits!(IMM8, 8);
10551    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
10552}
10553
10554/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10555/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10556/// out when the corresponding mask bit is not set).
10557///
10558/// Rounding is done according to the imm8 parameter, which can be one of:
10559///
10560/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10561/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10562/// * [`_MM_FROUND_TO_POS_INF`] : round up
10563/// * [`_MM_FROUND_TO_ZERO`] : truncate
10564/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10565///
10566/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
10567#[inline]
10568#[target_feature(enable = "avx512fp16")]
10569#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10570#[rustc_legacy_const_generics(2)]
10571#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10572pub unsafe fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10573    static_assert_uimm_bits!(IMM8, 8);
10574    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10575}
10576
10577/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10578/// number of bits specified by imm8, and store the results in dst.
10579///
10580/// Rounding is done according to the imm8 parameter, which can be one of:
10581///
10582/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10583/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10584/// * [`_MM_FROUND_TO_POS_INF`] : round up
10585/// * [`_MM_FROUND_TO_ZERO`] : truncate
10586/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10587///
10588/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
10589///
10590/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
10591#[inline]
10592#[target_feature(enable = "avx512fp16")]
10593#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10594#[rustc_legacy_const_generics(1, 2)]
10595#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10596pub unsafe fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10597    static_assert_uimm_bits!(IMM8, 8);
10598    static_assert_sae!(SAE);
10599    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10600}
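
// A hedged usage sketch (hypothetical names): combining an explicit reduce
// immediate with `_MM_FROUND_NO_EXC` performs the full 32-lane reduction
// without raising floating-point exceptions:
//
//     let frac = _mm512_reduce_round_ph::<_MM_FROUND_TO_NEAREST_INT, _MM_FROUND_NO_EXC>(x);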
10601
10602/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10603/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10604/// from src when the corresponding mask bit is not set).
10605///
10606/// Rounding is done according to the imm8 parameter, which can be one of:
10607///
10608/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10609/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10610/// * [`_MM_FROUND_TO_POS_INF`] : round up
10611/// * [`_MM_FROUND_TO_ZERO`] : truncate
10612/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10613///
10614/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
10615///
10616/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
10617#[inline]
10618#[target_feature(enable = "avx512fp16")]
10619#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10620#[rustc_legacy_const_generics(3, 4)]
10621#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10622pub unsafe fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10623    src: __m512h,
10624    k: __mmask32,
10625    a: __m512h,
10626) -> __m512h {
10627    static_assert_uimm_bits!(IMM8, 8);
10628    static_assert_sae!(SAE);
10629    vreduceph_512(a, IMM8, src, k, SAE)
10630}
10631
10632/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10633/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10634/// out when the corresponding mask bit is not set).
10635///
10636/// Rounding is done according to the imm8 parameter, which can be one of:
10637///
10638/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10639/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10640/// * [`_MM_FROUND_TO_POS_INF`] : round up
10641/// * [`_MM_FROUND_TO_ZERO`] : truncate
10642/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10643///
10644/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
10645///
10646/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
10647#[inline]
10648#[target_feature(enable = "avx512fp16")]
10649#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10650#[rustc_legacy_const_generics(2, 3)]
10651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10652pub unsafe fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10653    k: __mmask32,
10654    a: __m512h,
10655) -> __m512h {
10656    static_assert_uimm_bits!(IMM8, 8);
10657    static_assert_sae!(SAE);
10658    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10659}
10660
10661/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10662/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
10663/// upper 7 packed elements from a to the upper elements of dst.
10664///
10665/// Rounding is done according to the imm8 parameter, which can be one of:
10666///
10667/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10668/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10669/// * [`_MM_FROUND_TO_POS_INF`] : round up
10670/// * [`_MM_FROUND_TO_ZERO`] : truncate
10671/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10672///
10673/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
10674#[inline]
10675#[target_feature(enable = "avx512fp16")]
10676#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10677#[rustc_legacy_const_generics(2)]
10678#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10679pub unsafe fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10680    static_assert_uimm_bits!(IMM8, 8);
10681    _mm_mask_reduce_sh::<IMM8>(_mm_undefined_ph(), 0xff, a, b)
10682}
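
// A hedged usage sketch (hypothetical names): the scalar form reduces only
// lane 0 of `b` and passes the upper lanes of `a` through unchanged:
//
//     // dst[0] = b[0] - trunc(b[0]); dst[1..8] = a[1..8]
//     let dst = _mm_reduce_sh::<_MM_FROUND_TO_ZERO>(a, b);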
10683
10684/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10685/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10686/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
10687/// a to the upper elements of dst.
10688///
10689/// Rounding is done according to the imm8 parameter, which can be one of:
10690///
10691/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10692/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10693/// * [`_MM_FROUND_TO_POS_INF`] : round up
10694/// * [`_MM_FROUND_TO_ZERO`] : truncate
10695/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10696///
10697/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
10698#[inline]
10699#[target_feature(enable = "avx512fp16")]
10700#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10701#[rustc_legacy_const_generics(4)]
10702#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10703pub unsafe fn _mm_mask_reduce_sh<const IMM8: i32>(
10704    src: __m128h,
10705    k: __mmask8,
10706    a: __m128h,
10707    b: __m128h,
10708) -> __m128h {
10709    static_assert_uimm_bits!(IMM8, 8);
10710    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10711}
10712
10713/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10714/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10715/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10716/// to the upper elements of dst.
10717///
10718/// Rounding is done according to the imm8 parameter, which can be one of:
10719///
10720/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10721/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10722/// * [`_MM_FROUND_TO_POS_INF`] : round up
10723/// * [`_MM_FROUND_TO_ZERO`] : truncate
10724/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10725///
10726/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
10727#[inline]
10728#[target_feature(enable = "avx512fp16")]
10729#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10730#[rustc_legacy_const_generics(3)]
10731#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10732pub unsafe fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10733    static_assert_uimm_bits!(IMM8, 8);
10734    _mm_mask_reduce_sh::<IMM8>(_mm_setzero_ph(), k, a, b)
10735}
10736
10737/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10738/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
10739/// 7 packed elements from a to the upper elements of dst.
10740///
10741/// Rounding is done according to the imm8 parameter, which can be one of:
10742///
10743/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10744/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10745/// * [`_MM_FROUND_TO_POS_INF`] : round up
10746/// * [`_MM_FROUND_TO_ZERO`] : truncate
10747/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10748///
10749/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
10750///
10751/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
10752#[inline]
10753#[target_feature(enable = "avx512fp16")]
10754#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10755#[rustc_legacy_const_generics(2, 3)]
10756#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10757pub unsafe fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10758    a: __m128h,
10759    b: __m128h,
10760) -> __m128h {
10761    static_assert_uimm_bits!(IMM8, 8);
10762    static_assert_sae!(SAE);
10763    _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_undefined_ph(), 0xff, a, b)
10764}
10765
10766/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10767/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10768/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
10769/// to the upper elements of dst.
10770///
10771/// Rounding is done according to the imm8 parameter, which can be one of:
10772///
10773/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10774/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10775/// * [`_MM_FROUND_TO_POS_INF`] : round up
10776/// * [`_MM_FROUND_TO_ZERO`] : truncate
10777/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10778///
10779/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
10780///
10781/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
10782#[inline]
10783#[target_feature(enable = "avx512fp16")]
10784#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10785#[rustc_legacy_const_generics(4, 5)]
10786#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10787pub unsafe fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10788    src: __m128h,
10789    k: __mmask8,
10790    a: __m128h,
10791    b: __m128h,
10792) -> __m128h {
10793    static_assert_uimm_bits!(IMM8, 8);
10794    static_assert_sae!(SAE);
10795    vreducesh(a, b, src, k, IMM8, SAE)
10796}
10797
10798/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10799/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
10800/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
10801/// to the upper elements of dst.
10802///
10803/// Rounding is done according to the imm8 parameter, which can be one of:
10804///
10805/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10806/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10807/// * [`_MM_FROUND_TO_POS_INF`] : round up
10808/// * [`_MM_FROUND_TO_ZERO`] : truncate
10809/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10810///
10811/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
10812///
10813/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
10814#[inline]
10815#[target_feature(enable = "avx512fp16")]
10816#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
10817#[rustc_legacy_const_generics(3, 4)]
10818#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10819pub unsafe fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
10820    k: __mmask8,
10821    a: __m128h,
10822    b: __m128h,
10823) -> __m128h {
10824    static_assert_uimm_bits!(IMM8, 8);
10825    static_assert_sae!(SAE);
10826    _mm_mask_reduce_round_sh::<IMM8, SAE>(_mm_setzero_ph(), k, a, b)
10827}
10828
10829/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
10830/// sum of all elements in a.
10831///
10832/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
10833#[inline]
10834#[target_feature(enable = "avx512fp16,avx512vl")]
10835#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10836pub unsafe fn _mm_reduce_add_ph(a: __m128h) -> f16 {
10837    let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
10838    let a = _mm_add_ph(a, b);
10839    let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
10840    let a = _mm_add_ph(a, b);
10841    simd_extract::<_, f16>(a, 0) + simd_extract::<_, f16>(a, 1)
10842}
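
// Note: unlike the `vreduceph`-backed intrinsics above, this helper is an
// emulated horizontal reduction - the vector is folded in halves with shuffles,
// so rounding may differ from a strictly sequential sum. A hedged usage sketch
// with hypothetical values:
//
//     let v = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
//     let sum = _mm_reduce_add_ph(v); // 36.0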
10843
10844/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
10845/// sum of all elements in a.
10846///
10847/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
10848#[inline]
10849#[target_feature(enable = "avx512fp16,avx512vl")]
10850#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10851pub unsafe fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
10852    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
10853    let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
10854    _mm_reduce_add_ph(_mm_add_ph(p, q))
10855}
10856
10857/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
10858/// sum of all elements in a.
10859///
10860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
10861#[inline]
10862#[target_feature(enable = "avx512fp16")]
10863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10864pub unsafe fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
10865    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
10866    let q = simd_shuffle!(
10867        a,
10868        a,
10869        [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
10870    );
10871    _mm256_reduce_add_ph(_mm256_add_ph(p, q))
10872}
10873
10874/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
10875/// the product of all elements in a.
10876///
10877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
10878#[inline]
10879#[target_feature(enable = "avx512fp16,avx512vl")]
10880#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10881pub unsafe fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
10882    let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
10883    let a = _mm_mul_ph(a, b);
10884    let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
10885    let a = _mm_mul_ph(a, b);
10886    simd_extract::<_, f16>(a, 0) * simd_extract::<_, f16>(a, 1)
10887}
10888
10889/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
10890/// the product of all elements in a.
10891///
10892/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
10893#[inline]
10894#[target_feature(enable = "avx512fp16,avx512vl")]
10895#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10896pub unsafe fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
10897    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
10898    let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
10899    _mm_reduce_mul_ph(_mm_mul_ph(p, q))
10900}
10901
10902/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
10903/// the product of all elements in a.
10904///
10905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
10906#[inline]
10907#[target_feature(enable = "avx512fp16")]
10908#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10909pub unsafe fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
10910    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
10911    let q = simd_shuffle!(
10912        a,
10913        a,
10914        [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
10915    );
10916    _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
10917}
10918
10919/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
10920/// minimum of all elements in a.
10921///
10922/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
10923#[inline]
10924#[target_feature(enable = "avx512fp16,avx512vl")]
10925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10926pub unsafe fn _mm_reduce_min_ph(a: __m128h) -> f16 {
10927    let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
10928    let a = _mm_min_ph(a, b);
10929    let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
10930    let a = _mm_min_ph(a, b);
10931    let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
10932    simd_extract!(_mm_min_sh(a, b), 0)
10933}
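
// A hedged usage sketch (hypothetical values); the final fold goes through
// `_mm_min_sh`, so the last combine keeps the `vminph` instruction semantics:
//
//     let v = _mm_set_ph(4.0, -1.5, 9.0, 0.5, 2.0, 3.0, 7.0, 6.0);
//     let lo = _mm_reduce_min_ph(v); // -1.5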
10934
10935/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
10936/// minimum of all elements in a.
10937///
10938/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
10939#[inline]
10940#[target_feature(enable = "avx512fp16,avx512vl")]
10941#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10942pub unsafe fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
10943    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
10944    let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
10945    _mm_reduce_min_ph(_mm_min_ph(p, q))
10946}
10947
10948/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
10949/// minimum of all elements in a.
10950///
10951/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
10952#[inline]
10953#[target_feature(enable = "avx512fp16")]
10954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10955pub unsafe fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
10956    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
10957    let q = simd_shuffle!(
10958        a,
10959        a,
10960        [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
10961    );
10962    _mm256_reduce_min_ph(_mm256_min_ph(p, q))
10963}
10964
10965/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
10966/// maximum of all elements in a.
10967///
10968/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
10969#[inline]
10970#[target_feature(enable = "avx512fp16,avx512vl")]
10971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10972pub unsafe fn _mm_reduce_max_ph(a: __m128h) -> f16 {
10973    let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
10974    let a = _mm_max_ph(a, b);
10975    let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
10976    let a = _mm_max_ph(a, b);
10977    let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
10978    simd_extract!(_mm_max_sh(a, b), 0)
10979}
10980
10981/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
10982/// maximum of all elements in a.
10983///
10984/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
10985#[inline]
10986#[target_feature(enable = "avx512fp16,avx512vl")]
10987#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10988pub unsafe fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
10989    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
10990    let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
10991    _mm_reduce_max_ph(_mm_max_ph(p, q))
10992}
10993
10994/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
10995/// maximum of all elements in a.
10996///
10997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
10998#[inline]
10999#[target_feature(enable = "avx512fp16")]
11000#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11001pub unsafe fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
11002    let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11003    let q = simd_shuffle!(
11004        a,
11005        a,
11006        [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
11007    );
11008    _mm256_reduce_max_ph(_mm256_max_ph(p, q))
11009}
11010
11011macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
11012    ($mask_type: ty, $reg: ident, $a: expr) => {{
11013        let dst: $mask_type;
11014        asm!(
11015            "vfpclassph {k}, {src}, {imm8}",
11016            k = lateout(kreg) dst,
11017            src = in($reg) $a,
11018            imm8 = const IMM8,
11019            options(pure, nomem, nostack)
11020        );
11021        dst
11022    }};
11023    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
11024        let dst: $mask_type;
11025        asm!(
11026            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
11027            k = lateout(kreg) dst,
11028            mask = in(kreg) $mask,
11029            src = in($reg) $a,
11030            imm8 = const IMM8,
11031            options(pure, nomem, nostack)
11032        );
11033        dst
11034    }};
11035}
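
// The two arms differ only in the optional `{k}` write-mask operand. Declaring
// the asm! block `pure, nomem, nostack` tells the compiler it has no side
// effects and reads no memory, so the classification can be reordered or
// deduplicated just like a plain intrinsic call.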
11036
11037/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11038/// by imm8, and store the results in mask vector k.
11039/// imm8 can be a combination of:
11040///
11041///     0x01 // QNaN
11042///     0x02 // Positive Zero
11043///     0x04 // Negative Zero
11044///     0x08 // Positive Infinity
11045///     0x10 // Negative Infinity
11046///     0x20 // Denormal
11047///     0x40 // Negative
11048///     0x80 // SNaN
11049///
11050/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
11051#[inline]
11052#[target_feature(enable = "avx512fp16,avx512vl")]
11053#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11054#[rustc_legacy_const_generics(1)]
11055#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11056pub unsafe fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11057    static_assert_uimm_bits!(IMM8, 8);
11058    fpclass_asm!(__mmask8, xmm_reg, a)
11059}
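
// A hedged usage sketch (hypothetical mask constant): category bits may be
// OR-ed together, so a single vfpclassph can flag every NaN lane, quiet or
// signaling, at once:
//
//     let nan_lanes: __mmask8 = _mm_fpclass_ph_mask::<{ 0x01 | 0x80 }>(x);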
11060
11061/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11062/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
11063/// corresponding mask bit is not set).
11064/// imm8 can be a combination of:
11065///
11066///     0x01 // QNaN
11067///     0x02 // Positive Zero
11068///     0x04 // Negative Zero
11069///     0x08 // Positive Infinity
11070///     0x10 // Negative Infinity
11071///     0x20 // Denormal
11072///     0x40 // Negative
11073///     0x80 // SNaN
11074///
11075/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
11076#[inline]
11077#[target_feature(enable = "avx512fp16,avx512vl")]
11078#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11079#[rustc_legacy_const_generics(2)]
11080#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11081pub unsafe fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11082    static_assert_uimm_bits!(IMM8, 8);
11083    fpclass_asm!(__mmask8, k1, xmm_reg, a)
11084}
11085
11086/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11087/// by imm8, and store the results in mask vector k.
11088/// imm8 can be a combination of:
11089///
11090///     0x01 // QNaN
11091///     0x02 // Positive Zero
11092///     0x04 // Negative Zero
11093///     0x08 // Positive Infinity
11094///     0x10 // Negative Infinity
11095///     0x20 // Denormal
11096///     0x40 // Negative
11097///     0x80 // SNaN
11098///
11099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
11100#[inline]
11101#[target_feature(enable = "avx512fp16,avx512vl")]
11102#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11103#[rustc_legacy_const_generics(1)]
11104#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11105pub unsafe fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
11106    static_assert_uimm_bits!(IMM8, 8);
11107    fpclass_asm!(__mmask16, ymm_reg, a)
11108}
11109
11110/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11111/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
11112/// corresponding mask bit is not set).
11113/// imm8 can be a combination of:
11114///
11115///     0x01 // QNaN
11116///     0x02 // Positive Zero
11117///     0x04 // Negative Zero
11118///     0x08 // Positive Infinity
11119///     0x10 // Negative Infinity
11120///     0x20 // Denormal
11121///     0x40 // Negative
11122///     0x80 // SNaN
11123///
11124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
11125#[inline]
11126#[target_feature(enable = "avx512fp16,avx512vl")]
11127#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11128#[rustc_legacy_const_generics(2)]
11129#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11130pub unsafe fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
11131    static_assert_uimm_bits!(IMM8, 8);
11132    fpclass_asm!(__mmask16, k1, ymm_reg, a)
11133}
11134
11135/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11136/// by imm8, and store the results in mask vector k.
11137/// imm8 can be a combination of:
11138///
11139///     0x01 // QNaN
11140///     0x02 // Positive Zero
11141///     0x04 // Negative Zero
11142///     0x08 // Positive Infinity
11143///     0x10 // Negative Infinity
11144///     0x20 // Denormal
11145///     0x40 // Negative
11146///     0x80 // SNaN
11147///
11148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
11149#[inline]
11150#[target_feature(enable = "avx512fp16")]
11151#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11152#[rustc_legacy_const_generics(1)]
11153#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11154pub unsafe fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
11155    static_assert_uimm_bits!(IMM8, 8);
11156    fpclass_asm!(__mmask32, zmm_reg, a)
11157}
11158
11159/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11160/// by imm8, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the
11161/// corresponding mask bit is not set).
11162/// imm8 can be a combination of:
11163///
11164///     0x01 // QNaN
11165///     0x02 // Positive Zero
11166///     0x04 // Negative Zero
11167///     0x08 // Positive Infinity
11168///     0x10 // Negative Infinity
11169///     0x20 // Denormal
11170///     0x40 // Negative
11171///     0x80 // SNaN
11172///
11173/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
11174#[inline]
11175#[target_feature(enable = "avx512fp16")]
11176#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11177#[rustc_legacy_const_generics(2)]
11178#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11179pub unsafe fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
11180    static_assert_uimm_bits!(IMM8, 8);
11181    fpclass_asm!(__mmask32, k1, zmm_reg, a)
11182}
11183
11184/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11185/// by imm8, and store the result in mask vector k.
11186/// imm8 can be a combination of:
11187///
11188///     0x01 // QNaN
11189///     0x02 // Positive Zero
11190///     0x04 // Negative Zero
11191///     0x08 // Positive Infinity
11192///     0x10 // Negative Infinity
11193///     0x20 // Denormal
11194///     0x40 // Negative
11195///     0x80 // SNaN
11196///
11197/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
11198#[inline]
11199#[target_feature(enable = "avx512fp16")]
11200#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11201#[rustc_legacy_const_generics(1)]
11202#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11203pub unsafe fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11204    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
11205}
11206
11207/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11208/// by imm8, and store the result in mask vector k using zeromask k1 (the result is zeroed out when
11209/// mask bit 0 is not set).
11210/// imm8 can be a combination of:
11211///
11212///     0x01 // QNaN
11213///     0x02 // Positive Zero
11214///     0x04 // Negative Zero
11215///     0x08 // Positive Infinity
11216///     0x10 // Negative Infinity
11217///     0x20 // Denormal
11218///     0x40 // Negative
11219///     0x80 // SNaN
11220///
11221/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
11222#[inline]
11223#[target_feature(enable = "avx512fp16")]
11224#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11225#[rustc_legacy_const_generics(2)]
11226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11227pub unsafe fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11228    static_assert_uimm_bits!(IMM8, 8);
11229    vfpclasssh(a, IMM8, k1)
11230}
11231
11232/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11233/// and store the results in dst.
11234///
11235/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
11236#[inline]
11237#[target_feature(enable = "avx512fp16,avx512vl")]
11238#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11239pub unsafe fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
11240    simd_select_bitmask(k, b, a)
11241}
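
// A hedged usage sketch (hypothetical names): bit i of k selects lane i of b
// when set and lane i of a when clear:
//
//     // low four lanes from a, high four lanes from b
//     let mixed = _mm_mask_blend_ph(0b1111_0000, a, b);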
11242
11243/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11244/// and store the results in dst.
11245///
11246/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
11247#[inline]
11248#[target_feature(enable = "avx512fp16,avx512vl")]
11249#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11250pub unsafe fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
11251    simd_select_bitmask(k, b, a)
11252}
11253
11254/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11255/// and store the results in dst.
11256///
11257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
11258#[inline]
11259#[target_feature(enable = "avx512fp16")]
11260#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11261pub unsafe fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
11262    simd_select_bitmask(k, b, a)
11263}
11264
11265/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11266/// and index in idx, and store the results in dst.
11267///
11268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
11269#[inline]
11270#[target_feature(enable = "avx512fp16,avx512vl")]
11271#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11272pub unsafe fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
11273    _mm_castsi128_ph(_mm_permutex2var_epi16(
11274        _mm_castph_si128(a),
11275        idx,
11276        _mm_castph_si128(b),
11277    ))
11278}
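
// A hedged usage sketch (hypothetical names): each 16-bit index selects one of
// the 16 lanes of the concatenated pair, with bit 3 choosing between a (clear)
// and b (set); e.g. interleaving the low halves of a and b:
//
//     let idx = _mm_set_epi16(11, 3, 10, 2, 9, 1, 8, 0);
//     let mixed = _mm_permutex2var_ph(a, idx, b); // [a0, b0, a1, b1, a2, b2, a3, b3]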
11279
11280/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11281/// and index in idx, and store the results in dst.
11282///
11283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
11284#[inline]
11285#[target_feature(enable = "avx512fp16,avx512vl")]
11286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11287pub unsafe fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
11288    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
11289        _mm256_castph_si256(a),
11290        idx,
11291        _mm256_castph_si256(b),
11292    ))
11293}
11294
11295/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11296/// and index in idx, and store the results in dst.
11297///
11298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
11299#[inline]
11300#[target_feature(enable = "avx512fp16")]
11301#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11302pub unsafe fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
11303    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
11304        _mm512_castph_si512(a),
11305        idx,
11306        _mm512_castph_si512(b),
11307    ))
11308}
11309
11310/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11311/// and store the results in dst.
11312///
11313/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
11314#[inline]
11315#[target_feature(enable = "avx512fp16,avx512vl")]
11316#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11317pub unsafe fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
11318    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
11319}
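
// A hedged usage sketch (hypothetical names): reversing the lane order of a
// vector with an index vector counting down:
//
//     let idx = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
//     let rev = _mm_permutexvar_ph(idx, a); // [a7, a6, a5, a4, a3, a2, a1, a0]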
11320
11321/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11322/// and store the results in dst.
11323///
11324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
11325#[inline]
11326#[target_feature(enable = "avx512fp16,avx512vl")]
11327#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11328pub unsafe fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
11329    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
11330}
11331
11332/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11333/// and store the results in dst.
11334///
11335/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
11336#[inline]
11337#[target_feature(enable = "avx512fp16")]
11338#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11339pub unsafe fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
11340    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
11341}
11342
11343/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11344/// and store the results in dst.
11345///
11346/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
11347#[inline]
11348#[target_feature(enable = "avx512fp16,avx512vl")]
11349#[cfg_attr(test, assert_instr(vcvtw2ph))]
11350#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11351pub unsafe fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
11352    vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION)
11353}
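
// A hedged usage sketch (hypothetical values): every i16 up to +/-2048 is
// exactly representable in f16, so small integers convert losslessly:
//
//     let ints = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
//     let halves = _mm_cvtepi16_ph(ints); // [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]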
11354
11355/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11356/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11357/// mask bit is not set).
11358///
11359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
11360#[inline]
11361#[target_feature(enable = "avx512fp16,avx512vl")]
11362#[cfg_attr(test, assert_instr(vcvtw2ph))]
11363#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11364pub unsafe fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11365    simd_select_bitmask(k, _mm_cvtepi16_ph(a), src)
11366}
11367
11368/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11369/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11370///
11371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
11372#[inline]
11373#[target_feature(enable = "avx512fp16,avx512vl")]
11374#[cfg_attr(test, assert_instr(vcvtw2ph))]
11375#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11376pub unsafe fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
11377    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
11378}
11379
11380/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11381/// and store the results in dst.
11382///
11383/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
11384#[inline]
11385#[target_feature(enable = "avx512fp16,avx512vl")]
11386#[cfg_attr(test, assert_instr(vcvtw2ph))]
11387#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11388pub unsafe fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
11389    vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION)
11390}
11391
11392/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11393/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11394/// mask bit is not set).
11395///
11396/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
11397#[inline]
11398#[target_feature(enable = "avx512fp16,avx512vl")]
11399#[cfg_attr(test, assert_instr(vcvtw2ph))]
11400#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11401pub unsafe fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11402    simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src)
11403}
11404
11405/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11406/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11407///
11408/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
11409#[inline]
11410#[target_feature(enable = "avx512fp16,avx512vl")]
11411#[cfg_attr(test, assert_instr(vcvtw2ph))]
11412#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11413pub unsafe fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
11414    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
11415}
11416
11417/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11418/// and store the results in dst.
11419///
11420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
11421#[inline]
11422#[target_feature(enable = "avx512fp16")]
11423#[cfg_attr(test, assert_instr(vcvtw2ph))]
11424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11425pub unsafe fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
11426    vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION)
11427}
11428
11429/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11430/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11431/// mask bit is not set).
11432///
11433/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
11434#[inline]
11435#[target_feature(enable = "avx512fp16")]
11436#[cfg_attr(test, assert_instr(vcvtw2ph))]
11437#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11438pub unsafe fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11439    simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src)
11440}
11441
11442/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11443/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11444///
11445/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
11446#[inline]
11447#[target_feature(enable = "avx512fp16")]
11448#[cfg_attr(test, assert_instr(vcvtw2ph))]
11449#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11450pub unsafe fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
11451    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
11452}
11453
11454/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11455/// and store the results in dst.
11456///
11457/// Rounding is done according to the rounding parameter, which can be one of:
11458///
11459/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11460/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11461/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11462/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11463/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11464///
11465/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
11466#[inline]
11467#[target_feature(enable = "avx512fp16")]
11468#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11469#[rustc_legacy_const_generics(1)]
11470#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11471pub unsafe fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11472    static_assert_rounding!(ROUNDING);
11473    vcvtw2ph_512(a.as_i16x32(), ROUNDING)
11474}
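
// A hedged usage sketch (hypothetical values): f16 has an 11-bit significand,
// so 16-bit integers beyond +/-2048 are not all exactly representable and the
// ROUNDING parameter decides which neighbour is chosen. Between 2048 and 4096
// the f16 spacing is 2, so 3001 rounds to 3000.0 or 3002.0:
//
//     let v = _mm512_set1_epi16(3001);
//     let up = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(v); // 3002.0
//     let dn = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(v); // 3000.0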
11475
11476/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11477/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11478/// mask bit is not set).
11479///
11480/// Rounding is done according to the rounding parameter, which can be one of:
11481///
11482/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11483/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11484/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11485/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11486/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11487///
11488/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
11489#[inline]
11490#[target_feature(enable = "avx512fp16")]
11491#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11492#[rustc_legacy_const_generics(3)]
11493#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11494pub unsafe fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
11495    src: __m512h,
11496    k: __mmask32,
11497    a: __m512i,
11498) -> __m512h {
11499    static_assert_rounding!(ROUNDING);
11500    simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
11501}
11502
11503/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11504/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11505///
11506/// Rounding is done according to the rounding parameter, which can be one of:
11507///
11508/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11509/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11510/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11511/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11512/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11513///
11514/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
11515#[inline]
11516#[target_feature(enable = "avx512fp16")]
11517#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11518#[rustc_legacy_const_generics(2)]
11519#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11520pub unsafe fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(
11521    k: __mmask32,
11522    a: __m512i,
11523) -> __m512h {
11524    static_assert_rounding!(ROUNDING);
11525    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11526}
11527
11528/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11529/// and store the results in dst.
11530///
11531/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
11532#[inline]
11533#[target_feature(enable = "avx512fp16,avx512vl")]
11534#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11535#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11536pub unsafe fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
11537    vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION)
11538}
11539
11540/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11541/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11542/// mask bit is not set).
11543///
11544/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
11545#[inline]
11546#[target_feature(enable = "avx512fp16,avx512vl")]
11547#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11548#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11549pub unsafe fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11550    simd_select_bitmask(k, _mm_cvtepu16_ph(a), src)
11551}
11552
11553/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11554/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11555///
11556/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
11557#[inline]
11558#[target_feature(enable = "avx512fp16,avx512vl")]
11559#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11560#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11561pub unsafe fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
11562    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
11563}
11564
11565/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11566/// and store the results in dst.
11567///
11568/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
11569#[inline]
11570#[target_feature(enable = "avx512fp16,avx512vl")]
11571#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11572#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11573pub unsafe fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
11574    vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION)
11575}
11576
11577/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11578/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11579/// mask bit is not set).
11580///
11581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
11582#[inline]
11583#[target_feature(enable = "avx512fp16,avx512vl")]
11584#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11585#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11586pub unsafe fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11587    simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src)
11588}
11589
11590/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11591/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11592///
11593/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
11594#[inline]
11595#[target_feature(enable = "avx512fp16,avx512vl")]
11596#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11597#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11598pub unsafe fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
11599    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
11600}
11601
11602/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11603/// and store the results in dst.
11604///
11605/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
11606#[inline]
11607#[target_feature(enable = "avx512fp16")]
11608#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11609#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11610pub unsafe fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
11611    vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION)
11612}
11613
11614/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11615/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11616/// mask bit is not set).
11617///
11618/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
    simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src)
}

/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
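///
/// # Examples
///
/// A minimal sketch of the zeromask semantics (nightly-only, doctest marked
/// `ignore`): lanes whose mask bit is clear are zeroed instead of copied.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use core::arch::x86_64::*;
///
/// // Illustrative sketch only: needs nightly + a CPU with AVX512-FP16.
/// #[target_feature(enable = "avx512fp16")]
/// unsafe fn demo() {
///     let a = _mm512_set1_epi16(7);
///     // Mask bit 0 is clear, so lane 0 becomes 0.0 rather than 7.0.
///     let h = _mm512_maskz_cvtepu16_ph(0xFFFF_FFFE, a);
///     assert_eq!(_mm512_cvtsh_h(h), 0.0);
/// }
/// ```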
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
}

/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
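///
/// # Examples
///
/// A minimal sketch of passing the rounding mode as a const generic
/// (nightly-only, doctest marked `ignore`). Every value here is exactly
/// representable in f16, so the chosen mode does not change the result; it
/// starts to matter once inputs exceed f16's 11-bit significand.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use core::arch::x86_64::*;
///
/// // Illustrative sketch only: needs nightly + a CPU with AVX512-FP16.
/// #[target_feature(enable = "avx512fp16")]
/// unsafe fn demo() {
///     let a = _mm512_set1_epi16(9);
///     // The rounding mode is fixed at compile time via the const generic.
///     let h = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
///     assert_eq!(_mm512_cvtsh_h(h), 9.0);
/// }
/// ```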
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
    static_assert_rounding!(ROUNDING);
    vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
}

/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
    src: __m512h,
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
}

/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512i,
) -> __m512h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
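///
/// # Examples
///
/// A minimal sketch of the narrowing layout (nightly-only, doctest marked
/// `ignore`): four i32 lanes land in the four low f16 lanes and the upper
/// half of the 128-bit result is zeroed.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use core::arch::x86_64::*;
///
/// // Illustrative sketch only: needs nightly + AVX512-FP16 and AVX512VL.
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// unsafe fn demo() {
///     // _mm_set_epi32 lists lanes high-to-low, so lane 0 holds 1.
///     let a = _mm_set_epi32(-4, -3, 2, 1);
///     let h = _mm_cvtepi32_ph(a);
///     assert_eq!(_mm_cvtsh_h(h), 1.0);
/// }
/// ```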
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    vcvtdq2ph_128(a.as_i32x4(), src, k)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
    vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
    vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
    simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
    _mm512_mask_cvtepi32_ph(_mm256_setzero_ph(), k, a)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
    static_assert_rounding!(ROUNDING);
    vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
}

/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
}

/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
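///
/// # Examples
///
/// A minimal sketch of the scalar conversion (nightly-only, doctest marked
/// `ignore`): only lane 0 is replaced, the rest pass through from a.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use core::arch::x86_64::*;
///
/// // Illustrative sketch only: needs nightly + a CPU with AVX512-FP16.
/// #[target_feature(enable = "avx512fp16")]
/// unsafe fn demo() {
///     let a = _mm_set1_ph(2.0);
///     // Lane 0 becomes 42.0; lanes 1..=7 keep a's 2.0.
///     let r = _mm_cvti32_sh(a, 42);
///     assert_eq!(_mm_cvtsh_h(r), 42.0);
/// }
/// ```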
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
    vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vcvtsi2sh(a, b, ROUNDING)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    vcvtudq2ph_128(a.as_u32x4(), src, k)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
    vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
    vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
    simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
    _mm512_mask_cvtepu32_ph(_mm256_setzero_ph(), k, a)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
    static_assert_rounding!(ROUNDING);
    vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
}

/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512i,
) -> __m256h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
}

/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtusi2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
    vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
/// of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vcvtusi2sh(a, b, ROUNDING)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
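///
/// # Examples
///
/// A minimal sketch of the 64-bit narrowing layout (nightly-only, doctest
/// marked `ignore`): the two i64 lanes fill only the two lowest f16 lanes,
/// leaving the upper 96 bits (six f16 lanes) zeroed.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use core::arch::x86_64::*;
///
/// // Illustrative sketch only: needs nightly + AVX512-FP16 and AVX512VL.
/// #[target_feature(enable = "avx512fp16,avx512vl")]
/// unsafe fn demo() {
///     // _mm_set_epi64x lists lanes high-to-low, so lane 0 holds 5.
///     let a = _mm_set_epi64x(-8, 5);
///     let h = _mm_cvtepi64_ph(a);
///     assert_eq!(_mm_cvtsh_h(h), 5.0);
/// }
/// ```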
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    vcvtqq2ph_128(a.as_i64x2(), src, k)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    vcvtqq2ph_256(a.as_i64x4(), src, k)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
    vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
    simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
    _mm512_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
}

/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
    vcvtuqq2ph_128(a.as_u64x2(), src, k)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
    vcvtuqq2ph_256(a.as_u64x4(), src, k)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
/// The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
    vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
    simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
    _mm512_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
}

/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512i,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtxps_ph(a: __m128) -> __m128h {
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
    vcvtps2phx_128(a, src, k)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
    vcvtps2phx_256(a, src, k)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2phx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
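///
/// # Examples
///
/// A minimal sketch of the float narrowing (nightly-only, doctest marked
/// `ignore`): sixteen f32 lanes narrow to sixteen f16 lanes. 1.5 is exactly
/// representable in f16, so no rounding occurs here.
///
/// ```ignore
/// #![feature(stdarch_x86_avx512_f16, f16)]
/// use core::arch::x86_64::*;
///
/// // Illustrative sketch only: needs nightly + a CPU with AVX512-FP16.
/// #[target_feature(enable = "avx512fp16")]
/// unsafe fn demo() {
///     let a = _mm512_set1_ps(1.5);
///     let h: __m256h = _mm512_cvtxps_ph(a);
///     assert_eq!(_mm256_cvtsh_h(h), 1.5);
/// }
/// ```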
12611#[inline]
12612#[target_feature(enable = "avx512fp16")]
12613#[cfg_attr(test, assert_instr(vcvtps2phx))]
12614#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12615pub unsafe fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
12616    _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), 0xffff, a)
12617}
12618
12619/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12620/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12621/// when the corresponding mask bit is not set).
12622///
12623/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
12624#[inline]
12625#[target_feature(enable = "avx512fp16")]
12626#[cfg_attr(test, assert_instr(vcvtps2phx))]
12627#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12628pub unsafe fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
12629    vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION)
12630}
12631
12632/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12633/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12634/// corresponding mask bit is not set).
12635///
12636/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
12637#[inline]
12638#[target_feature(enable = "avx512fp16")]
12639#[cfg_attr(test, assert_instr(vcvtps2phx))]
12640#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12641pub unsafe fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
12642    _mm512_mask_cvtxps_ph(_mm256_setzero_ph(), k, a)
12643}
12644
12645/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12646/// floating-point elements, and store the results in dst.
12647///
12648/// Rounding is done according to the rounding parameter, which can be one of:
12649///
12650/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12651/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12652/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12653/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12654/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12655///
12656/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
12657#[inline]
12658#[target_feature(enable = "avx512fp16")]
12659#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
12660#[rustc_legacy_const_generics(1)]
12661#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12662pub unsafe fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
12663    static_assert_rounding!(ROUNDING);
12664    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(_mm256_setzero_ph(), 0xffff, a)
12665}
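
// Illustrative sketch (not part of the original source): selecting an explicit
// rounding mode for the conversion above by OR-ing a direction flag with
// exception suppression, as `static_assert_rounding!` requires. Assumes some
// `a: __m512` is in scope.
//
//     let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);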

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
    src: __m256h,
    k: __mmask16,
    a: __m512,
) -> __m256h {
    static_assert_rounding!(ROUNDING);
    vcvtps2phx_512(a, src, k, ROUNDING)
}

/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
) -> __m256h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(_mm256_setzero_ph(), k, a)
}

/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
    _mm_mask_cvtss_sh(_mm_undefined_ph(), 0xff, a, b)
}
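
// Illustrative sketch (not part of the original source): the lower f32 lane of
// `b` is converted, while the upper seven f16 lanes are taken from `a`.
//
//     let a = _mm_set1_ph(7.0);
//     let b = _mm_set_ss(1.5);
//     let r = _mm_cvtss_sh(a, b);
//     // r = [1.5, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0] as f16 lanes.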

/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
    vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
    _mm_mask_cvtss_sh(_mm_setzero_ph(), k, a, b)
}

/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_cvt_roundss_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vcvtss2sh(a, b, src, k, ROUNDING)
}

/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_cvt_roundss_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
}
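
// Illustrative sketch (not part of the original source): two f64 lanes narrow
// to two f16 lanes in the low 32 bits; the upper 96 bits of the result are zero.
//
//     let a = _mm_set_pd(2.0, 1.0);
//     let r = _mm_cvtpd_ph(a);
//     // r = [1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] as f16 lanes.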

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
    vcvtpd2ph_128(a, src, k)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
    vcvtpd2ph_256(a, src, k)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
    _mm512_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
    vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
    _mm512_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(_mm_setzero_ph(), 0xff, a)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
/// when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m512d,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vcvtpd2ph_512(a, src, k, ROUNDING)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(_mm_setzero_ph(), k, a)
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
    _mm_mask_cvtsd_sh(_mm_undefined_ph(), 0xff, a, b)
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
    vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
    _mm_mask_cvtsd_sh(_mm_setzero_ph(), k, a, b)
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
/// elements from a to the upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_cvt_roundsd_sh::<ROUNDING>(_mm_undefined_ph(), 0xff, a, b)
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element
/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
    src: __m128h,
    k: __mmask8,
    a: __m128h,
    b: __m128d,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    vcvtsd2sh(a, b, src, k, ROUNDING)
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
/// elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
    b: __m128d,
) -> __m128h {
    static_assert_rounding!(ROUNDING);
    _mm_mask_cvt_roundsd_sh::<ROUNDING>(_mm_setzero_ph(), k, a, b)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
}
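
// Illustrative sketch (not part of the original source): without an explicit
// rounding argument, the conversion above follows `MXCSR.RC`, which defaults
// to round-to-nearest-even.
//
//     let a = _mm_set1_ph(2.5);
//     let r = _mm_cvtph_epi16(a);
//     // Under the default nearest-even rounding, each i16 lane becomes 2.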

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvtph2w_128(a, src.as_i16x8(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    transmute(vcvtph2w_256(a, src.as_i16x16(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    transmute(vcvtph2w_512(
        a,
        src.as_i16x32(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
}
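
// Illustrative sketch (not part of the original source): the unsigned variant
// covers magnitudes that fit in a u16 but would overflow an i16.
//
//     let a = _mm_set1_ph(40000.0); // exactly representable as f16
//     let r = _mm_cvtph_epu16(a);
//     // Each u16 lane becomes 40000, which does not fit in an i16.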

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvtph2uw_128(a, src.as_u16x8(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    transmute(vcvtph2uw_256(a, src.as_u16x16(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    transmute(vcvtph2uw_512(
        a,
        src.as_u16x32(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundph_epu16<const ROUNDING: i32>(a: __m512h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundph_epu16<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtph2uw_512(a, src.as_u16x32(), k, ROUNDING))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uw, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundph_epu16<const ROUNDING: i32>(
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epu16::<ROUNDING>(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
}
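
// Illustrative sketch (not part of the original source): the `cvtt` family
// truncates toward zero instead of honoring `MXCSR.RC`.
//
//     let a = _mm_set1_ph(-2.7);
//     let r = _mm_cvttph_epi16(a);
//     // Truncation drops the fraction toward zero, so each i16 lane becomes -2.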

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvttph2w_128(a, src.as_i16x8(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    transmute(vcvttph2w_256(a, src.as_i16x16(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    transmute(vcvttph2w_512(
        a,
        src.as_i16x32(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
/// truncation, and store the results in dst.
///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
}
13662
13663/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13664/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13665/// mask bit is not set).
13666///
13667/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
13668///
13669/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
13670#[inline]
13671#[target_feature(enable = "avx512fp16")]
13672#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13673#[rustc_legacy_const_generics(3)]
13674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13675pub unsafe fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
13676    src: __m512i,
13677    k: __mmask32,
13678    a: __m512h,
13679) -> __m512i {
13680    static_assert_sae!(SAE);
13681    transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
13682}
13683
13684/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13685/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13686/// mask bit is not set).
13687///
13688/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
13689///
13690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
13691#[inline]
13692#[target_feature(enable = "avx512fp16")]
13693#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13694#[rustc_legacy_const_generics(2)]
13695#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13696pub unsafe fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13697    static_assert_sae!(SAE);
13698    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
13699}
13700
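// A minimal sketch (commented out; not upstream code) of the `SAE` const
// generic on the `_round` variants: it only suppresses floating-point
// exceptions, since truncation needs no rounding mode. The `sae_demo` name is
// hypothetical.
//
// #[target_feature(enable = "avx512fp16")]
// unsafe fn sae_demo() {
//     let a = _mm512_set1_ph(-7.9);
//     // All 32 lanes truncate to -7 without raising exceptions.
//     let t = _mm512_cvtt_roundph_epi16::<{ _MM_FROUND_NO_EXC }>(a);
// }
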
/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvttph2uw_128(a, src.as_u16x8(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
    transmute(vcvttph2uw_256(a, src.as_u16x16(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
    transmute(vcvttph2uw_512(
        a,
        src.as_u16x32(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
/// mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
    src: __m512i,
    k: __mmask32,
    a: __m512h,
) -> __m512i {
    static_assert_sae!(SAE);
    transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
/// mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
}

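// A minimal sketch (commented out; not upstream code) of the unsigned
// truncating conversion; in-range positive inputs simply lose their fraction.
// The `epu16_demo` name is hypothetical.
//
// #[target_feature(enable = "avx512fp16,avx512vl")]
// unsafe fn epu16_demo() {
//     let a = _mm_set1_ph(7.9);
//     let t = _mm_cvttph_epu16(a); // every u16 lane becomes 7
// }
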
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvtph2dq_128(a, src.as_i32x4(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    transmute(vcvtph2dq_256(a, src.as_i32x8(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    transmute(vcvtph2dq_512(
        a,
        src.as_i32x16(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
}

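// A minimal sketch (commented out; not upstream code) showing how the
// `ROUNDING` parameter changes the converted result; the `rounding_demo` name
// is hypothetical.
//
// #[target_feature(enable = "avx512fp16")]
// unsafe fn rounding_demo() {
//     let a = _mm256_set1_ph(2.5);
//     // Round-to-nearest-even: 2.5 -> 2 in every lane.
//     let n = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
//     // Round toward +infinity: 2.5 -> 3 in every lane.
//     let p = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
// }
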
/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2si))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtsh_i32(a: __m128h) -> i32 {
    vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
/// the result in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
    static_assert_rounding!(ROUNDING);
    vcvtsh2si32(a, ROUNDING)
}

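// A minimal sketch (commented out; not upstream code): the scalar conversion
// reads only the lowest f16 lane. The `scalar_i32_demo` name is hypothetical.
//
// #[target_feature(enable = "avx512fp16")]
// unsafe fn scalar_i32_demo() {
//     let a = _mm_set_sh(1.5);
//     let i = _mm_cvtsh_i32(a); // follows MXCSR.RC; 2 under nearest-even
//     let j = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a); // 1
// }
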
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvtph2udq_128(a, src.as_u32x4(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    transmute(vcvtph2udq_256(a, src.as_u32x8(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    transmute(vcvtph2udq_512(
        a,
        src.as_u32x16(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
}

/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2usi))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtsh_u32(a: __m128h) -> u32 {
    vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
/// the result in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvt_roundsh_u32<const ROUNDING: i32>(a: __m128h) -> u32 {
    static_assert_rounding!(ROUNDING);
    vcvtsh2usi32(a, ROUNDING)
}

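// A minimal sketch (commented out; not upstream code) of the unsigned scalar
// conversion; the `scalar_u32_demo` name is hypothetical.
//
// #[target_feature(enable = "avx512fp16")]
// unsafe fn scalar_u32_demo() {
//     let a = _mm_set_sh(3.5);
//     let u = _mm_cvtsh_u32(a); // 4 under the default nearest-even rounding
//     let v = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a); // 3
// }
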
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvttph2dq_128(a, src.as_i32x4(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    transmute(vcvttph2dq_256(a, src.as_i32x8(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
    transmute(vcvttph2dq_512(
        a,
        src.as_i32x16(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2dq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
}

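// A minimal sketch (commented out; not upstream code): the f16 -> i32
// conversions widen each element, so a 128-bit result consumes only the lower
// four f16 lanes of the source. The `widen_demo` name is hypothetical.
//
// #[target_feature(enable = "avx512fp16,avx512vl")]
// unsafe fn widen_demo() {
//     let a = _mm_set1_ph(-3.7);
//     let t = _mm_cvttph_epi32(a); // four i32 lanes, each -3
// }
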
14427/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14428/// store the results in dst.
14429///
14430/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14431///
14432/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
14433#[inline]
14434#[target_feature(enable = "avx512fp16")]
14435#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14436#[rustc_legacy_const_generics(1)]
14437#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14438pub unsafe fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
14439    static_assert_sae!(SAE);
14440    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14441}
14442
14443/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14444/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14445///
14446/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14447///
14448/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
14449#[inline]
14450#[target_feature(enable = "avx512fp16")]
14451#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14452#[rustc_legacy_const_generics(3)]
14453#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14454pub unsafe fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
14455    src: __m512i,
14456    k: __mmask16,
14457    a: __m256h,
14458) -> __m512i {
14459    static_assert_sae!(SAE);
14460    transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
14461}
14462
14463/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14464/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14465///
14466/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14467///
14468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
14469#[inline]
14470#[target_feature(enable = "avx512fp16")]
14471#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14472#[rustc_legacy_const_generics(2)]
14473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14474pub unsafe fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14475    static_assert_sae!(SAE);
14476    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
14477}
14478
14479/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14480/// the result in dst.
14481///
14482/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
14483#[inline]
14484#[target_feature(enable = "avx512fp16")]
14485#[cfg_attr(test, assert_instr(vcvttsh2si))]
14486#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14487pub unsafe fn _mm_cvttsh_i32(a: __m128h) -> i32 {
14488    vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION)
14489}
14490
14491/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14492/// the result in dst.
14493///
14494/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14495///
14496/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
14497#[inline]
14498#[target_feature(enable = "avx512fp16")]
14499#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
14500#[rustc_legacy_const_generics(1)]
14501#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14502pub unsafe fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
14503    static_assert_sae!(SAE);
14504    vcvttsh2si32(a, SAE)
14505}
14506
14507/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14508/// store the results in dst.
14509///
14510/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
14511#[inline]
14512#[target_feature(enable = "avx512fp16,avx512vl")]
14513#[cfg_attr(test, assert_instr(vcvttph2udq))]
14514#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14515pub unsafe fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
14516    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
14517}
14518
14519/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14520/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14521///
14522/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
14523#[inline]
14524#[target_feature(enable = "avx512fp16,avx512vl")]
14525#[cfg_attr(test, assert_instr(vcvttph2udq))]
14526#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14527pub unsafe fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14528    transmute(vcvttph2udq_128(a, src.as_u32x4(), k))
14529}
14530
14531/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14532/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14533///
14534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
14535#[inline]
14536#[target_feature(enable = "avx512fp16,avx512vl")]
14537#[cfg_attr(test, assert_instr(vcvttph2udq))]
14538#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14539pub unsafe fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14540    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
14541}
14542
14543/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14544/// store the results in dst.
14545///
14546/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
14547#[inline]
14548#[target_feature(enable = "avx512fp16,avx512vl")]
14549#[cfg_attr(test, assert_instr(vcvttph2udq))]
14550#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14551pub unsafe fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
14552    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
14553}
14554
14555/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14556/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14557///
14558/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
14559#[inline]
14560#[target_feature(enable = "avx512fp16,avx512vl")]
14561#[cfg_attr(test, assert_instr(vcvttph2udq))]
14562#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14563pub unsafe fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14564    transmute(vcvttph2udq_256(a, src.as_u32x8(), k))
14565}
14566
14567/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14568/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14569///
14570/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
14571#[inline]
14572#[target_feature(enable = "avx512fp16,avx512vl")]
14573#[cfg_attr(test, assert_instr(vcvttph2udq))]
14574#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14575pub unsafe fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14576    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
14577}
14578
14579/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14580/// store the results in dst.
14581///
14582/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
14583#[inline]
14584#[target_feature(enable = "avx512fp16")]
14585#[cfg_attr(test, assert_instr(vcvttph2udq))]
14586#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14587pub unsafe fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
14588    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14589}
14590
14591/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14592/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14593///
14594/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
14595#[inline]
14596#[target_feature(enable = "avx512fp16")]
14597#[cfg_attr(test, assert_instr(vcvttph2udq))]
14598#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14599pub unsafe fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14600    transmute(vcvttph2udq_512(
14601        a,
14602        src.as_u32x16(),
14603        k,
14604        _MM_FROUND_CUR_DIRECTION,
14605    ))
14606}
14607
14608/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14609/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14610///
14611/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
14612#[inline]
14613#[target_feature(enable = "avx512fp16")]
14614#[cfg_attr(test, assert_instr(vcvttph2udq))]
14615#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14616pub unsafe fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14617    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
14618}
14619
14620/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14621/// store the results in dst.
14622///
14623/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14624///
14625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m256h,
) -> __m512i {
    static_assert_sae!(SAE);
    transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
}

/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
/// the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
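///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set_sh(42.7);
/// // Truncates toward zero, so this yields 42u32.
/// let r = _mm_cvttsh_u32(a);
/// ```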
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2usi))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvttsh_u32(a: __m128h) -> u32 {
    vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
/// the result in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
    static_assert_sae!(SAE);
    vcvttsh2usi32(a, SAE)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
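///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest). Only the
/// two lowest half-precision elements of `a` feed the two 64-bit lanes:
///
/// ```ignore
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 9.0);
/// // dst = [9, -2] as i64 lanes (element 0 is the lowest lane).
/// let r = _mm_cvtph_epi64(a);
/// ```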
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvtph2qq_128(a, src.as_i64x2(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    transmute(vcvtph2qq_256(a, src.as_i64x4(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    transmute(vcvtph2qq_512(
        a,
        src.as_i64x8(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
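///
/// # Example
///
/// A minimal sketch of selecting a rounding mode (illustrative only; not
/// compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(2.5);
/// // Round-to-nearest-even maps 2.5 to 2 in every lane.
/// let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// ```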
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
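///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(6.25);
/// // Rounds using the current MXCSR mode; 6.25 becomes 6 in both u64 lanes.
/// let r = _mm_cvtph_epu64(a);
/// ```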
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvtph2uqq_128(a, src.as_u64x2(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    transmute(vcvtph2uqq_256(a, src.as_u64x4(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    transmute(vcvtph2uqq_512(
        a,
        src.as_u64x8(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
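///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(-1.9);
/// // Truncation rounds toward zero, so both i64 lanes hold -1.
/// let r = _mm_cvttph_epi64(a);
/// ```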
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvttph2qq_128(a, src.as_i64x2(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    transmute(vcvttph2qq_256(a, src.as_i64x4(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    transmute(vcvttph2qq_512(
        a,
        src.as_i64x8(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    static_assert_sae!(SAE);
    transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
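///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(255.9);
/// // Truncates toward zero: both u64 lanes hold 255.
/// let r = _mm_cvttph_epu64(a);
/// ```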
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
    transmute(vcvttph2uqq_128(a, src.as_u64x2(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
    transmute(vcvttph2uqq_256(a, src.as_u64x4(), k))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
    transmute(vcvttph2uqq_512(
        a,
        src.as_u64x8(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m128h,
) -> __m512i {
    static_assert_sae!(SAE);
    transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
    static_assert_sae!(SAE);
    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
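///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest). The four
/// lowest half-precision elements widen losslessly to f32:
///
/// ```ignore
/// let a = _mm_set1_ph(1.5);
/// let r = _mm_cvtxph_ps(a); // [1.5f32; 4]
/// ```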
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
    vcvtph2psx_128(a, src, k)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
    vcvtph2psx_256(a, src, k)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
    vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
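///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest). Widening
/// is exact, so SAE only suppresses flags for invalid inputs such as sNaN:
///
/// ```ignore
/// let a = _mm256_set1_ph(0.1);
/// let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
/// ```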
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(1)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
    static_assert_sae!(SAE);
    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
/// dst when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
    src: __m512,
    k: __mmask16,
    a: __m256h,
) -> __m512 {
    static_assert_sae!(SAE);
    vcvtph2psx_512(a, src, k, SAE)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
    static_assert_sae!(SAE);
    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
}

/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
/// elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
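///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ps(8.0);
/// let b = _mm_set_sh(0.5);
/// // dst = [0.5, 8.0, 8.0, 8.0]: lane 0 from `b`, upper lanes from `a`.
/// let r = _mm_cvtsh_ss(a, b);
/// ```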
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
    _mm_mask_cvtsh_ss(a, 0xff, a, b)
}

/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
/// upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION)
}

/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
/// of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss))]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
    _mm_mask_cvtsh_ss(_mm_setzero_ps(), k, a, b)
}

/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
/// from a to the upper elements of dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
    static_assert_sae!(SAE);
    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
}

/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using writemask k (the element is
/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
/// upper elements of dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128h,
) -> __m128 {
    static_assert_sae!(SAE);
    vcvtsh2ss(a, b, src, k, SAE)
}

/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
/// of dst.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128h,
) -> __m128 {
    static_assert_sae!(SAE);
    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_setzero_ps(), k, a, b)
}

/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
/// floating-point elements, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
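///
/// # Example
///
/// A minimal sketch (illustrative only; not compiled as a doctest). The two
/// lowest half-precision elements widen exactly to f64:
///
/// ```ignore
/// let a = _mm_set_sh(3.25);
/// let r = _mm_cvtph_pd(a); // [3.25, 0.0]
/// ```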
15677#[inline]
15678#[target_feature(enable = "avx512fp16,avx512vl")]
15679#[cfg_attr(test, assert_instr(vcvtph2pd))]
15680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15681pub unsafe fn _mm_cvtph_pd(a: __m128h) -> __m128d {
15682    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
15683}
15684
15685/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15686/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15687/// dst when the corresponding mask bit is not set).
15688///
15689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
15690#[inline]
15691#[target_feature(enable = "avx512fp16,avx512vl")]
15692#[cfg_attr(test, assert_instr(vcvtph2pd))]
15693#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15694pub unsafe fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
15695    vcvtph2pd_128(a, src, k)
15696}
15697
15698/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15699/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15700/// corresponding mask bit is not set).
15701///
15702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
15703#[inline]
15704#[target_feature(enable = "avx512fp16,avx512vl")]
15705#[cfg_attr(test, assert_instr(vcvtph2pd))]
15706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15707pub unsafe fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
15708    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
15709}
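
// Added sketch: in the 128-bit form only the two lowest f16 lanes of `a`
// participate, so only mask bits 0 and 1 matter. Hypothetical helper.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
unsafe fn example_maskz_cvtph_pd(a: __m128h) -> __m128d {
    // Keep lane 0 of the f64 result, zero lane 1.
    _mm_maskz_cvtph_pd(0b01, a)
}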
15710
15711/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15712/// floating-point elements, and store the results in dst.
15713///
15714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
15715#[inline]
15716#[target_feature(enable = "avx512fp16,avx512vl")]
15717#[cfg_attr(test, assert_instr(vcvtph2pd))]
15718#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15719pub unsafe fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
15720    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
15721}
15722
15723/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15724/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15725/// dst when the corresponding mask bit is not set).
15726///
15727/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
15728#[inline]
15729#[target_feature(enable = "avx512fp16,avx512vl")]
15730#[cfg_attr(test, assert_instr(vcvtph2pd))]
15731#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15732pub unsafe fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
15733    vcvtph2pd_256(a, src, k)
15734}
15735
15736/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15737/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15738/// corresponding mask bit is not set).
15739///
15740/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
15741#[inline]
15742#[target_feature(enable = "avx512fp16,avx512vl")]
15743#[cfg_attr(test, assert_instr(vcvtph2pd))]
15744#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15745pub unsafe fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
15746    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
15747}
15748
15749/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15750/// floating-point elements, and store the results in dst.
15751///
15752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
15753#[inline]
15754#[target_feature(enable = "avx512fp16")]
15755#[cfg_attr(test, assert_instr(vcvtph2pd))]
15756#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15757pub unsafe fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
15758    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
15759}
15760
15761/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15762/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15763/// dst when the corresponding mask bit is not set).
15764///
15765/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
15766#[inline]
15767#[target_feature(enable = "avx512fp16")]
15768#[cfg_attr(test, assert_instr(vcvtph2pd))]
15769#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15770pub unsafe fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
15771    vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION)
15772}
15773
15774/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15775/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15776/// corresponding mask bit is not set).
15777///
15778/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
15779#[inline]
15780#[target_feature(enable = "avx512fp16")]
15781#[cfg_attr(test, assert_instr(vcvtph2pd))]
15782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15783pub unsafe fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
15784    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
15785}
15786
15787/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15788/// floating-point elements, and store the results in dst.
15789///
15790/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15791///
15792/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
15793#[inline]
15794#[target_feature(enable = "avx512fp16")]
15795#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
15796#[rustc_legacy_const_generics(1)]
15797#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15798pub unsafe fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
15799    static_assert_sae!(SAE);
15800    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
15801}
15802
15803/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15804/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15805/// dst when the corresponding mask bit is not set).
15806///
15807/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15808///
15809/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
15810#[inline]
15811#[target_feature(enable = "avx512fp16")]
15812#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
15813#[rustc_legacy_const_generics(3)]
15814#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15815pub unsafe fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
15816    src: __m512d,
15817    k: __mmask8,
15818    a: __m128h,
15819) -> __m512d {
15820    static_assert_sae!(SAE);
15821    vcvtph2pd_512(a, src, k, SAE)
15822}
15823
15824/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
15825/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15826/// corresponding mask bit is not set).
15827///
15828/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15829///
15830/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
15831#[inline]
15832#[target_feature(enable = "avx512fp16")]
15833#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
15834#[rustc_legacy_const_generics(2)]
15835#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15836pub unsafe fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
15837    static_assert_sae!(SAE);
15838    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
15839}
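
// Added sketch: the 512-bit widening conversion reads 8 f16 lanes from a
// 128-bit source, and the `_cvt_round` form adds a compile-time SAE operand.
// Hypothetical helper.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_cvt_roundph_pd(a: __m128h) -> __m512d {
    _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a)
}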
15840
15841/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
15842/// floating-point element, store the result in the lower element of dst, and copy the upper element
15843/// from a to the upper element of dst.
15844///
15845/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
15846#[inline]
15847#[target_feature(enable = "avx512fp16")]
15848#[cfg_attr(test, assert_instr(vcvtsh2sd))]
15849#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15850pub unsafe fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
15851    _mm_mask_cvtsh_sd(a, 0xff, a, b)
15852}
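
// Added sketch showing which lanes move: lane 0 of the result is the converted
// f16 from `b`; lane 1 is carried over from `a`. Hypothetical helper.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_cvtsh_sd() -> __m128d {
    let a = _mm_set_pd(8.0, 7.0); // lanes: [7.0, 8.0]
    let b = _mm_set_sh(1.5);
    _mm_cvtsh_sd(a, b) // lanes: [1.5, 8.0]
}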
15853
15854/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
15855/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15856/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
15857/// of dst.
15858///
15859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
15860#[inline]
15861#[target_feature(enable = "avx512fp16")]
15862#[cfg_attr(test, assert_instr(vcvtsh2sd))]
15863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15864pub unsafe fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
15865    vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION)
15866}
15867
15868/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
15869/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15870/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
15871///
15872/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
15873#[inline]
15874#[target_feature(enable = "avx512fp16")]
15875#[cfg_attr(test, assert_instr(vcvtsh2sd))]
15876#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15877pub unsafe fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
15878    _mm_mask_cvtsh_sd(_mm_setzero_pd(), k, a, b)
15879}
15880
15881/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
15882/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
15883/// to the upper element of dst.
15884///
15885/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15886///
15887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
15888#[inline]
15889#[target_feature(enable = "avx512fp16")]
15890#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
15891#[rustc_legacy_const_generics(2)]
15892#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15893pub unsafe fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
15894    static_assert_sae!(SAE);
15895    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
15896}
15897
15898/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
15899/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15900/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
15901/// of dst.
15902///
15903/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15904///
15905/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
15906#[inline]
15907#[target_feature(enable = "avx512fp16")]
15908#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
15909#[rustc_legacy_const_generics(4)]
15910#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15911pub unsafe fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
15912    src: __m128d,
15913    k: __mmask8,
15914    a: __m128d,
15915    b: __m128h,
15916) -> __m128d {
15917    static_assert_sae!(SAE);
15918    vcvtsh2sd(a, b, src, k, SAE)
15919}
15920
15921/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
15922/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15923/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
15924///
15925/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15926///
15927/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
15928#[inline]
15929#[target_feature(enable = "avx512fp16")]
15930#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
15931#[rustc_legacy_const_generics(3)]
15932#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15933pub unsafe fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(
15934    k: __mmask8,
15935    a: __m128d,
15936    b: __m128h,
15937) -> __m128d {
15938    static_assert_sae!(SAE);
15939    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_setzero_pd(), k, a, b)
15940}
15941
15942/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
15943///
15944/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
15945#[inline]
15946#[target_feature(enable = "avx512fp16")]
15947#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15948pub unsafe fn _mm_cvtsh_h(a: __m128h) -> f16 {
15949    simd_extract!(a, 0)
15950}
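
// Added sketch: extracting the lowest f16 is a plain lane read, with no
// conversion or rounding involved. Hypothetical helper.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_cvtsh_h() -> f16 {
    let v = _mm_set_sh(2.5);
    _mm_cvtsh_h(v) // 2.5
}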
15951
15952/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
15953///
15954/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
15955#[inline]
15956#[target_feature(enable = "avx512fp16")]
15957#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15958pub unsafe fn _mm256_cvtsh_h(a: __m256h) -> f16 {
15959    simd_extract!(a, 0)
15960}
15961
15962/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
15963///
15964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
15965#[inline]
15966#[target_feature(enable = "avx512fp16")]
15967#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15968pub unsafe fn _mm512_cvtsh_h(a: __m512h) -> f16 {
15969    simd_extract!(a, 0)
15970}
15971
15972/// Copy the lower 16-bit integer in a to dst.
15973///
15974/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
15975#[inline]
15976#[target_feature(enable = "avx512fp16")]
15977#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15978pub unsafe fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
15979    simd_extract!(a.as_i16x8(), 0)
15980}
15981
15982/// Copy 16-bit integer a to the lower element of dst, and zero the upper elements of dst.
15983///
15984/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
15985#[inline]
15986#[target_feature(enable = "avx512fp16")]
15987#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15988pub unsafe fn _mm_cvtsi16_si128(a: i16) -> __m128i {
15989    transmute(simd_insert!(i16x8::ZERO, 0, a))
15990}
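
// Added sketch: inserting an i16 into lane 0 and reading it back is lossless,
// since both directions are plain lane moves. Hypothetical helper.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
unsafe fn example_si16_roundtrip(x: i16) -> i16 {
    _mm_cvtsi128_si16(_mm_cvtsi16_si128(x)) // returns x unchanged
}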
15991
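// Raw LLVM intrinsics backing the wrappers above. Each `link_name` selects the
// LLVM builtin; masked variants take the merge source `src` and mask `k`
// explicitly, and the trailing `rounding`/`sae` parameters mirror the
// instructions' immediate control operands.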
15992#[allow(improper_ctypes)]
15993extern "C" {
15994    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
15995    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
15996    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
15997    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
15998
15999    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16000    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16001    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16002    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16003    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16004    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16005    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16006    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16007
16008    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16009    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16010    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16011    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16012    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16013    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16014    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16015    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16016
16017    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16018    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16019    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16020    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16021    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16022    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16023    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16024    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16025
16026    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16027    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16028    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16029    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16030    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16031    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16032    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16033    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16034
16035    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16036    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16037    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16038    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16039    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16040    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16041    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16042    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16043    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16044    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16045    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16046    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16047    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16048    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16049    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16050    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16051
16052    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16053    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16054    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16055    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16056    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16057    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16058    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16059    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16060    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16061    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16062        -> __m512;
16063    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16064    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16065        -> __m512;
16066    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16067    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16068    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16069    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16070
16071    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16072    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16073    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16074    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16075
16076    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.128"]
16077    fn vfmaddsubph_128(a: __m128h, b: __m128h, c: __m128h) -> __m128h;
16078    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.256"]
16079    fn vfmaddsubph_256(a: __m256h, b: __m256h, c: __m256h) -> __m256h;
16080    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16081    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16082
16083    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
16084    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16085    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
16086    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16087    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
16088    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16089    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
16090    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16091
16092    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
16093    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16094    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
16095    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16096    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
16097    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16098    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
16099    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16100
16101    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
16102    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
16103    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
16104    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16105
16106    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
16107    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
16108    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
16109    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
16110    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
16111    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16112    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
16113    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16114
16115    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
16116    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
16117    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
16118    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
16119    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
16120    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16121    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
16122    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16123
16124    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
16125    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16126    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
16127    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16128    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
16129    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16130    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
16131    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16132
16133    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
16134    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16135    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
16136    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16137    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
16138    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16139    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
16140    fn vgetmantsh(
16141        a: __m128h,
16142        b: __m128h,
16143        imm8: i32,
16144        src: __m128h,
16145        k: __mmask8,
16146        sae: i32,
16147    ) -> __m128h;
16148
16149    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
16150    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16151    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
16152    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16153    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
16154    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16155    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
16156    fn vrndscalesh(
16157        a: __m128h,
16158        b: __m128h,
16159        src: __m128h,
16160        k: __mmask8,
16161        imm8: i32,
16162        sae: i32,
16163    ) -> __m128h;
16164
16165    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
16166    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16167    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
16168    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16169    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
16170    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
16171    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
16172    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16173
16174    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
16175    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16176    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
16177    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16178    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
16179    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16180    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
16181    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
16182        -> __m128h;
16183
16184    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
16185    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16186
16187    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
16188    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
16189    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
16190    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
16191    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
16192    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
16193    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u16"]
16194    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
16195    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u16"]
16196    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
16197    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32u16"]
16198    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;
16199
16200    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
16201    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
16202    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
16203    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
16204    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
16205    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
16206    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
16207    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
16208    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
16209    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
16210    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u32"]
16211    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
16212    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16u32"]
16213    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
16214    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
16215    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;
16216
16217    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
16218    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
16219    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
16220    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
16221    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
16222    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
16223    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
16224    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
16225    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
16226    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
16227    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8u64"]
16228    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16229
16230    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
16231    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
16232    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
16233    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
16234    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
16235    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
16236    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
16237    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16238
16239    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
16240    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
16241    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
16242    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
16243    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
16244    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16245    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
16246    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16247
16248    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
16249    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16250    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
16251    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16252    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
16253    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
16254    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
16255    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16256    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
16257    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16258    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
16259    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, rounding: i32) -> u16x32;
16260
16261    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
16262    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16263    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
16264    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16265    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
16266    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
16267    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
16268    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16269    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
16270    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16271    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
16272    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16273
16274    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
16275    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16276    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
16277    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16278    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
16279    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
16280    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
16281    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
16282    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
16283    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16284    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
16285    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16286    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
16287    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
16288    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
16289    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;
16290
16291    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
16292    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16293    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
16294    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16295    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
16296    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
16297    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
16298    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
16299    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
16300    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16301    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
16302    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16303    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
16304    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
16305    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
16306    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16307
16308    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
16309    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16310    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
16311    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16312    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
16313    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
16314    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
16315    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16316    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
16317    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16318    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
16319    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;
16320
16321    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
16322    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16323    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
16324    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16325    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
16326    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
16327    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
16328    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16329    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
16330    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16331    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
16332    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;
16333
16334    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
16335    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
16336    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
16337    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
16338    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
16339    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
16340    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
16341    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;
16342
16343    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
16344    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
16345    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
16346    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
16347    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
16348    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
16349    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
16350    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16352}
16353
16354#[cfg(test)]
16355mod tests {
16356    use crate::core_arch::x86::*;
16357    use crate::mem::transmute;
16358    use crate::ptr::{addr_of, addr_of_mut};
16359    use stdarch_test::simd_test;
16360
16361    #[target_feature(enable = "avx512fp16")]
16362    unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
16363        _mm_setr_ph(re, im, re, im, re, im, re, im)
16364    }
16365
16366    #[target_feature(enable = "avx512fp16")]
16367    unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
16368        _mm256_setr_ph(
16369            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16370        )
16371    }
16372
16373    #[target_feature(enable = "avx512fp16")]
16374    unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
16375        _mm512_setr_ph(
16376            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16377            re, im, re, im, re, im, re, im, re, im,
16378        )
16379    }
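
    // Added sketch (not an original test): _mm_set1_pch interleaves the real
    // and imaginary parts as [re, im, re, im, ...] in memory order.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_set1_pch_layout_sketch() {
        let r = _mm_set1_pch(1.0, 2.0);
        let e = _mm_setr_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
        assert_eq_m128h(r, e);
    }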
16380
16381    #[simd_test(enable = "avx512fp16")]
16382    unsafe fn test_mm_set_ph() {
16383        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16384        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16385        assert_eq_m128h(r, e);
16386    }
16387
16388    #[simd_test(enable = "avx512fp16")]
16389    unsafe fn test_mm256_set_ph() {
16390        let r = _mm256_set_ph(
16391            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16392        );
16393        let e = _mm256_setr_ph(
16394            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16395        );
16396        assert_eq_m256h(r, e);
16397    }
16398
16399    #[simd_test(enable = "avx512fp16")]
16400    unsafe fn test_mm512_set_ph() {
16401        let r = _mm512_set_ph(
16402            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16403            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16404            31.0, 32.0,
16405        );
16406        let e = _mm512_setr_ph(
16407            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16408            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16409            3.0, 2.0, 1.0,
16410        );
16411        assert_eq_m512h(r, e);
16412    }
16413
16414    #[simd_test(enable = "avx512fp16")]
16415    unsafe fn test_mm_set_sh() {
16416        let r = _mm_set_sh(1.0);
16417        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16418        assert_eq_m128h(r, e);
16419    }
16420
16421    #[simd_test(enable = "avx512fp16")]
16422    unsafe fn test_mm_set1_ph() {
16423        let r = _mm_set1_ph(1.0);
16424        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16425        assert_eq_m128h(r, e);
16426    }
16427
16428    #[simd_test(enable = "avx512fp16")]
16429    unsafe fn test_mm256_set1_ph() {
16430        let r = _mm256_set1_ph(1.0);
16431        let e = _mm256_set_ph(
16432            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16433        );
16434        assert_eq_m256h(r, e);
16435    }
16436
16437    #[simd_test(enable = "avx512fp16")]
16438    unsafe fn test_mm512_set1_ph() {
16439        let r = _mm512_set1_ph(1.0);
16440        let e = _mm512_set_ph(
16441            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16442            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16443        );
16444        assert_eq_m512h(r, e);
16445    }
16446
16447    #[simd_test(enable = "avx512fp16")]
16448    unsafe fn test_mm_setr_ph() {
16449        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16450        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16451        assert_eq_m128h(r, e);
16452    }
16453
16454    #[simd_test(enable = "avx512fp16")]
16455    unsafe fn test_mm256_setr_ph() {
16456        let r = _mm256_setr_ph(
16457            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16458        );
16459        let e = _mm256_set_ph(
16460            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16461        );
16462        assert_eq_m256h(r, e);
16463    }
16464
16465    #[simd_test(enable = "avx512fp16")]
16466    unsafe fn test_mm512_setr_ph() {
16467        let r = _mm512_setr_ph(
16468            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16469            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16470            31.0, 32.0,
16471        );
16472        let e = _mm512_set_ph(
16473            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16474            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16475            3.0, 2.0, 1.0,
16476        );
16477        assert_eq_m512h(r, e);
16478    }
16479
16480    #[simd_test(enable = "avx512fp16,avx512vl")]
16481    unsafe fn test_mm_setzero_ph() {
16482        let r = _mm_setzero_ph();
16483        let e = _mm_set1_ph(0.0);
16484        assert_eq_m128h(r, e);
16485    }
16486
16487    #[simd_test(enable = "avx512fp16,avx512vl")]
16488    unsafe fn test_mm256_setzero_ph() {
16489        let r = _mm256_setzero_ph();
16490        let e = _mm256_set1_ph(0.0);
16491        assert_eq_m256h(r, e);
16492    }
16493
16494    #[simd_test(enable = "avx512fp16")]
16495    unsafe fn test_mm512_setzero_ph() {
16496        let r = _mm512_setzero_ph();
16497        let e = _mm512_set1_ph(0.0);
16498        assert_eq_m512h(r, e);
16499    }
16500
16501    #[simd_test(enable = "avx512fp16")]
16502    unsafe fn test_mm_castsi128_ph() {
16503        let a = _mm_set1_epi16(0x3c00);
16504        let r = _mm_castsi128_ph(a);
16505        let e = _mm_set1_ph(1.0);
16506        assert_eq_m128h(r, e);
16507    }
16508
16509    #[simd_test(enable = "avx512fp16")]
16510    unsafe fn test_mm256_castsi256_ph() {
16511        let a = _mm256_set1_epi16(0x3c00);
16512        let r = _mm256_castsi256_ph(a);
16513        let e = _mm256_set1_ph(1.0);
16514        assert_eq_m256h(r, e);
16515    }
16516
16517    #[simd_test(enable = "avx512fp16")]
16518    unsafe fn test_mm512_castsi512_ph() {
16519        let a = _mm512_set1_epi16(0x3c00);
16520        let r = _mm512_castsi512_ph(a);
16521        let e = _mm512_set1_ph(1.0);
16522        assert_eq_m512h(r, e);
16523    }
16524
16525    #[simd_test(enable = "avx512fp16")]
16526    unsafe fn test_mm_castph_si128() {
16527        let a = _mm_set1_ph(1.0);
16528        let r = _mm_castph_si128(a);
16529        let e = _mm_set1_epi16(0x3c00);
16530        assert_eq_m128i(r, e);
16531    }
16532
16533    #[simd_test(enable = "avx512fp16")]
16534    unsafe fn test_mm256_castph_si256() {
16535        let a = _mm256_set1_ph(1.0);
16536        let r = _mm256_castph_si256(a);
16537        let e = _mm256_set1_epi16(0x3c00);
16538        assert_eq_m256i(r, e);
16539    }
16540
16541    #[simd_test(enable = "avx512fp16")]
16542    unsafe fn test_mm512_castph_si512() {
16543        let a = _mm512_set1_ph(1.0);
16544        let r = _mm512_castph_si512(a);
16545        let e = _mm512_set1_epi16(0x3c00);
16546        assert_eq_m512i(r, e);
16547    }
16548
16549    #[simd_test(enable = "avx512fp16")]
16550    unsafe fn test_mm_castps_ph() {
16551        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
16552        let r = _mm_castps_ph(a);
16553        let e = _mm_set1_ph(1.0);
16554        assert_eq_m128h(r, e);
16555    }
16556
16557    #[simd_test(enable = "avx512fp16")]
16558    unsafe fn test_mm256_castps_ph() {
16559        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
16560        let r = _mm256_castps_ph(a);
16561        let e = _mm256_set1_ph(1.0);
16562        assert_eq_m256h(r, e);
16563    }
16564
16565    #[simd_test(enable = "avx512fp16")]
16566    unsafe fn test_mm512_castps_ph() {
16567        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
16568        let r = _mm512_castps_ph(a);
16569        let e = _mm512_set1_ph(1.0);
16570        assert_eq_m512h(r, e);
16571    }
16572
16573    #[simd_test(enable = "avx512fp16")]
16574    unsafe fn test_mm_castph_ps() {
16575        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
16576        let r = _mm_castph_ps(a);
16577        let e = _mm_set1_ps(1.0);
16578        assert_eq_m128(r, e);
16579    }
16580
16581    #[simd_test(enable = "avx512fp16")]
16582    unsafe fn test_mm256_castph_ps() {
16583        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
16584        let r = _mm256_castph_ps(a);
16585        let e = _mm256_set1_ps(1.0);
16586        assert_eq_m256(r, e);
16587    }
16588
16589    #[simd_test(enable = "avx512fp16")]
16590    unsafe fn test_mm512_castph_ps() {
16591        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
16592        let r = _mm512_castph_ps(a);
16593        let e = _mm512_set1_ps(1.0);
16594        assert_eq_m512(r, e);
16595    }
16596
16597    #[simd_test(enable = "avx512fp16")]
16598    unsafe fn test_mm_castpd_ph() {
16599        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
16600        let r = _mm_castpd_ph(a);
16601        let e = _mm_set1_ph(1.0);
16602        assert_eq_m128h(r, e);
16603    }
16604
16605    #[simd_test(enable = "avx512fp16")]
16606    unsafe fn test_mm256_castpd_ph() {
16607        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
16608        let r = _mm256_castpd_ph(a);
16609        let e = _mm256_set1_ph(1.0);
16610        assert_eq_m256h(r, e);
16611    }
16612
16613    #[simd_test(enable = "avx512fp16")]
16614    unsafe fn test_mm512_castpd_ph() {
16615        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
16616        let r = _mm512_castpd_ph(a);
16617        let e = _mm512_set1_ph(1.0);
16618        assert_eq_m512h(r, e);
16619    }
16620
16621    #[simd_test(enable = "avx512fp16")]
16622    unsafe fn test_mm_castph_pd() {
16623        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
16624        let r = _mm_castph_pd(a);
16625        let e = _mm_set1_pd(1.0);
16626        assert_eq_m128d(r, e);
16627    }
16628
16629    #[simd_test(enable = "avx512fp16")]
16630    unsafe fn test_mm256_castph_pd() {
16631        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
16632        let r = _mm256_castph_pd(a);
16633        let e = _mm256_set1_pd(1.0);
16634        assert_eq_m256d(r, e);
16635    }
16636
16637    #[simd_test(enable = "avx512fp16")]
16638    unsafe fn test_mm512_castph_pd() {
16639        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
16640        let r = _mm512_castph_pd(a);
16641        let e = _mm512_set1_pd(1.0);
16642        assert_eq_m512d(r, e);
16643    }
16644
16645    #[simd_test(enable = "avx512fp16")]
16646    unsafe fn test_mm256_castph256_ph128() {
16647        let a = _mm256_setr_ph(
16648            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16649        );
16650        let r = _mm256_castph256_ph128(a);
16651        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16652        assert_eq_m128h(r, e);
16653    }
16654
16655    #[simd_test(enable = "avx512fp16")]
16656    unsafe fn test_mm512_castph512_ph128() {
16657        let a = _mm512_setr_ph(
16658            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16659            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16660        );
16661        let r = _mm512_castph512_ph128(a);
16662        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16663        assert_eq_m128h(r, e);
16664    }
16665
16666    #[simd_test(enable = "avx512fp16")]
16667    unsafe fn test_mm512_castph512_ph256() {
16668        let a = _mm512_setr_ph(
16669            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
16670            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
16671        );
16672        let r = _mm512_castph512_ph256(a);
16673        let e = _mm256_setr_ph(
16674            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16675        );
16676        assert_eq_m256h(r, e);
16677    }
16678
16679    #[simd_test(enable = "avx512fp16")]
16680    unsafe fn test_mm256_castph128_ph256() {
16681        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16682        let r = _mm256_castph128_ph256(a);
16683        assert_eq_m128h(_mm256_castph256_ph128(r), a);
16684    }
16685
16686    #[simd_test(enable = "avx512fp16")]
16687    unsafe fn test_mm512_castph128_ph512() {
16688        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16689        let r = _mm512_castph128_ph512(a);
16690        assert_eq_m128h(_mm512_castph512_ph128(r), a);
16691    }
16692
16693    #[simd_test(enable = "avx512fp16")]
16694    unsafe fn test_mm512_castph256_ph512() {
16695        let a = _mm256_setr_ph(
16696            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16697        );
16698        let r = _mm512_castph256_ph512(a);
16699        assert_eq_m256h(_mm512_castph512_ph256(r), a);
16700    }
16701
16702    #[simd_test(enable = "avx512fp16")]
16703    unsafe fn test_mm256_zextph128_ph256() {
16704        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16705        let r = _mm256_zextph128_ph256(a);
16706        let e = _mm256_setr_ph(
16707            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
16708        );
16709        assert_eq_m256h(r, e);
16710    }
16711
16712    #[simd_test(enable = "avx512fp16")]
16713    unsafe fn test_mm512_zextph128_ph512() {
16714        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
16715        let r = _mm512_zextph128_ph512(a);
16716        let e = _mm512_setr_ph(
16717            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
16718            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
16719        );
16720        assert_eq_m512h(r, e);
16721    }
16722
16723    #[simd_test(enable = "avx512fp16")]
16724    unsafe fn test_mm512_zextph256_ph512() {
16725        let a = _mm256_setr_ph(
16726            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
16727        );
16728        let r = _mm512_zextph256_ph512(a);
16729        let e = _mm512_setr_ph(
16730            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
16731            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
16732        );
16733        assert_eq_m512h(r, e);
16734    }
16735
16736    #[simd_test(enable = "avx512fp16,avx512vl")]
16737    unsafe fn test_mm_cmp_ph_mask() {
16738        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16739        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
16740        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
16741        assert_eq!(r, 0b11110000);
16742    }
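
    // Added sketch: mask bit i corresponds to element i, and _mm_set_ph lists
    // its arguments from element 7 down to element 0, so the first argument
    // maps to the highest mask bit. Hypothetical extra check.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cmp_ph_mask_bit_order_sketch() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(-1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        // Only element 7 (the first argument) differs, clearing only bit 7.
        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b01111111);
    }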
16743
16744    #[simd_test(enable = "avx512fp16,avx512vl")]
16745    unsafe fn test_mm_mask_cmp_ph_mask() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
        assert_eq!(r, 0b01010000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cmp_ph_mask() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0,
        );
        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b1111000011110000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cmp_ph_mask() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0,
        );
        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
        assert_eq!(r, 0b0101000001010000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmp_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
        assert_eq!(r, 0b01010000010100000101000001010000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmp_round_ph_mask() {
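        // `_MM_FROUND_NO_EXC` (SAE) suppresses floating-point exceptions; it does not
        // change the comparison result.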
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
        assert_eq!(r, 0b11110000111100001111000011110000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
            -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        assert_eq!(r, 0b01010000010100000101000001010000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cmp_round_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cmp_round_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cmp_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cmp_sh_mask() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
        assert_eq!(r, 0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comi_round_sh() {
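        // `_mm_comi*` compares only the lowest f16 element and returns the predicate
        // as an i32 (1 = true, 0 = false).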
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comi_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comieq_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_comieq_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comige_sh() {
        let a = _mm_set_sh(2.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_comige_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comigt_sh() {
        let a = _mm_set_sh(2.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_comigt_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comile_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_comile_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comilt_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_comilt_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_comineq_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_comineq_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_ucomieq_sh() {
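        // The ucomi variants are the quiet comparisons: like `ucomiss`, they do not
        // signal an invalid-operation exception on QNaN inputs.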
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_ucomieq_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_ucomige_sh() {
        let a = _mm_set_sh(2.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_ucomige_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_ucomigt_sh() {
        let a = _mm_set_sh(2.0);
        let b = _mm_set_sh(1.0);
        let r = _mm_ucomigt_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_ucomile_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_ucomile_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_ucomilt_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_ucomilt_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_ucomineq_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_ucomineq_sh(a, b);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_load_ph() {
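        // The aligned loads require the pointer to be aligned to the vector width;
        // the address of an existing `__m128h`/`__m256h`/`__m512h` satisfies this,
        // hence the `addr_of!(a).cast()` pattern.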
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_load_ph(addr_of!(a).cast());
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_load_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_load_ph(addr_of!(a).cast());
        assert_eq_m256h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_load_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_load_ph(addr_of!(a).cast());
        assert_eq_m512h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_load_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_load_sh(addr_of!(a).cast());
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_load_sh() {
        let a = _mm_set_sh(1.0);
        let src = _mm_set_sh(2.0);
        let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast());
        assert_eq_m128h(a, b);
        let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast());
        assert_eq_m128h(src, b);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_load_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_maskz_load_sh(1, addr_of!(a).cast());
        assert_eq_m128h(a, b);
        let b = _mm_maskz_load_sh(0, addr_of!(a).cast());
        assert_eq_m128h(_mm_setzero_ph(), b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_loadu_ph() {
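        // The unaligned loads read in memory order, which matches `_mm_setr_ph`
        // (element 0 first) rather than `_mm_set_ph`.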
        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let r = _mm_loadu_ph(array.as_ptr());
        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_loadu_ph() {
        let array = [
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        ];
        let r = _mm256_loadu_ph(array.as_ptr());
        let e = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_loadu_ph() {
        let array = [
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        ];
        let r = _mm512_loadu_ph(array.as_ptr());
        let e = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_move_sh() {
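        // `_mm_move_sh` replaces element 0 of a with element 0 of b and keeps the
        // upper elements of a; the masked forms gate only that single element.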
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_sh(9.0);
        let r = _mm_move_sh(a, b);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_move_sh() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_sh(9.0);
        let src = _mm_set_sh(10.0);
        let r = _mm_mask_move_sh(src, 0, a, b);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_move_sh() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_sh(9.0);
        let r = _mm_maskz_move_sh(0, a, b);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_store_ph() {
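        // Like the aligned loads, the aligned stores require a destination aligned
        // to the vector width; a stack `__m128h`/`__m256h`/`__m512h` provides that.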
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut b = _mm_setzero_ph();
        _mm_store_ph(addr_of_mut!(b).cast(), a);
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_store_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut b = _mm256_setzero_ph();
        _mm256_store_ph(addr_of_mut!(b).cast(), a);
        assert_eq_m256h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_store_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let mut b = _mm512_setzero_ph();
        _mm512_store_ph(addr_of_mut!(b).cast(), a);
        assert_eq_m512h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_store_sh() {
        let a = _mm_set_sh(1.0);
        let mut b = _mm_setzero_ph();
        _mm_store_sh(addr_of_mut!(b).cast(), a);
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_store_sh() {
        let a = _mm_set_sh(1.0);
        let mut b = _mm_setzero_ph();
        _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
        assert_eq_m128h(_mm_setzero_ph(), b);
        _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
        assert_eq_m128h(a, b);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_storeu_ph() {
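        // The unaligned stores round-trip through a plain array and are checked by
        // reloading with the matching loadu.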
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut array = [0.0; 8];
        _mm_storeu_ph(array.as_mut_ptr(), a);
        assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr()));
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_storeu_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut array = [0.0; 16];
        _mm256_storeu_ph(array.as_mut_ptr(), a);
        assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr()));
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_storeu_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let mut array = [0.0; 32];
        _mm512_storeu_ph(array.as_mut_ptr(), a);
        assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr()));
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_add_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_add_ph(a, b);
        let e = _mm_set1_ph(9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_add_ph() {
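        // A set mask bit i selects a[i] + b[i]; a clear bit passes through src[i].
        // `_mm_set_ph` lists element 7 first, so the 0b01010101 mask (bits 0, 2, 4, 6)
        // computes the even-indexed elements.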
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_add_ph() {
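        // The maskz variants zero the unselected elements instead of copying from src.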
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_maskz_add_ph(0b01010101, a, b);
        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_add_ph(a, b);
        let e = _mm256_set1_ph(17.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_add_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_add_ph(a, b);
        let e = _mm512_set1_ph(33.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_add_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_add_round_ph() {
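        // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` selects round-to-nearest-even
        // with all exceptions suppressed (SAE); the *_round_* tests otherwise mirror
        // the non-rounding variants above.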
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(33.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_add_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_add_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_set_sh(3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_add_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_add_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r =
            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_set_sh(3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_add_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_add_sh(a, b);
        let e = _mm_set_sh(3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_add_sh() {
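        // The scalar masked ops gate only element 0; elements 1..=7 always come from
        // a (all zero here, since `_mm_set_sh` zeroes the upper elements).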
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_add_sh(src, 0, a, b);
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_add_sh(src, 1, a, b);
        let e = _mm_set_sh(3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_add_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_maskz_add_sh(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_add_sh(1, a, b);
        let e = _mm_set_sh(3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_sub_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_sub_ph(a, b);
        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_sub_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_sub_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_maskz_sub_ph(0b01010101, a, b);
        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_sub_ph(a, b);
        let e = _mm256_set_ph(
            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
            15.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_sub_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_sub_ph(a, b);
        let e = _mm512_set_ph(
            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
            23.0, 25.0, 27.0, 29.0, 31.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_sub_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set_ph(
            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
            23.0, 25.0, 27.0, 29.0, 31.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_sub_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_sub_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_set_sh(-1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_sub_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(-1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_sub_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r =
            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_set_sh(-1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_sub_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_sub_sh(a, b);
        let e = _mm_set_sh(-1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_sub_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_sub_sh(src, 0, a, b);
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_sub_sh(src, 1, a, b);
        let e = _mm_set_sh(-1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_sub_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_maskz_sub_sh(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_sub_sh(1, a, b);
        let e = _mm_set_sh(-1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mul_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_mul_ph(a, b);
        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_mul_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_mul_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
        let r = _mm_maskz_mul_ph(0b01010101, a, b);
        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_mul_ph(a, b);
        let e = _mm256_set_ph(
            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
            30.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let src = _mm256_set_ph(
            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
        );
        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_mul_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
        );
        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_mul_ph(a, b);
        let e = _mm512_set_ph(
            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_mul_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set_ph(
            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let src = _mm512_set_ph(
            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
        );
        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_mul_round_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
            3.0, 2.0, 1.0,
        );
        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mul_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_set_sh(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_mul_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_mul_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r =
            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_set_sh(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mul_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_mul_sh(a, b);
        let e = _mm_set_sh(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_mul_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_mul_sh(src, 0, a, b);
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_mul_sh(src, 1, a, b);
        let e = _mm_set_sh(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_mul_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_maskz_mul_sh(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_mul_sh(1, a, b);
        let e = _mm_set_sh(2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_div_ph() {
18046        let a = _mm_set1_ph(1.0);
18047        let b = _mm_set1_ph(2.0);
18048        let r = _mm_div_ph(a, b);
18049        let e = _mm_set1_ph(0.5);
18050        assert_eq_m128h(r, e);
18051    }
18052
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_div_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_div_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let r = _mm_maskz_div_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_div_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let r = _mm256_div_ph(a, b);
        let e = _mm256_set1_ph(0.5);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_div_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let src = _mm256_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0,
        );
        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_div_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_div_ph(a, b);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_div_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

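    // The `_round` variants take the rounding control as a const generic;
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` selects round-to-nearest-even
    // and suppresses floating-point exceptions (SAE).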
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(0.5);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let src = _mm512_set_ph(
            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
            33.0, 34.0, 35.0,
        );
        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_div_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_div_round_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_div_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_div_sh(a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_div_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let src = _mm_set_sh(4.0);
        let r = _mm_mask_div_sh(src, 0, a, b);
        let e = _mm_set_sh(4.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_div_sh(src, 1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_div_sh() {
        let a = _mm_set_sh(1.0);
        let b = _mm_set_sh(2.0);
        let r = _mm_maskz_div_sh(0, a, b);
        let e = _mm_set_sh(0.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_div_sh(1, a, b);
        let e = _mm_set_sh(0.5);
        assert_eq_m128h(r, e);
    }

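    // The `_pch` intrinsics treat each adjacent pair of f16 elements as one complex
    // number (real, imaginary). Every lane below holds 0.0 + 1.0i, and since
    // i * i = -1, each product is -1.0 + 0.0i. Mask bits select whole complex pairs.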
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_mul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_mul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_mul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_mul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_mul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_mul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_mul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

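    // The scalar complex (`_sch`) intrinsics operate on the lowest complex pair
    // only; the upper six elements are copied from `a` (or, for the low pair under
    // masking, taken from `src` or zeroed).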
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_mul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_mul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r =
            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_mul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_mul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_mul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_mul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_maskz_mul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

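    // The `fmul_pch` tests mirror the `mul_pch` tests above with identical inputs
    // and expected values.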
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_fmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_fmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_fmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r =
            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_fmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let r = _mm_maskz_fmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

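    // `cmul` multiplies `a` by the complex conjugate of `b`:
    // (0.0 + 1.0i) * conj(0.0 - 1.0i) = (0.0 + 1.0i) * (0.0 + 1.0i) = -1.0 + 0.0i.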
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_cmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_maskz_cmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_cmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_cmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_cmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_maskz_cmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r =
            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

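    // The `fcmul_pch` tests mirror the `cmul_pch` tests above with identical inputs
    // and expected values.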
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_fcmul_pch(a, b);
        let e = _mm_set1_pch(-1.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmul_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, -1.0);
        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_fcmul_pch(a, b);
        let e = _mm256_set1_pch(-1.0, 0.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fcmul_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, -1.0);
        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
        let e = _mm256_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_fcmul_pch(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmul_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_pch(-1.0, 0.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmul_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, -1.0);
        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
        );
        let e = _mm512_setr_ph(
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_fcmul_sch(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fcmul_sch(src, 0, a, b);
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fcmul_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_maskz_fcmul_sch(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fcmul_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
        let r =
            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

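    // `abs_ph` clears the sign bit of each f16 element, leaving the magnitude
    // unchanged.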
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_abs_ph() {
        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
        let r = _mm_abs_ph(a);
        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_abs_ph() {
        let a = _mm256_set_ph(
            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
            -14.0,
        );
        let r = _mm256_abs_ph(a);
        let e = _mm256_set_ph(
            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_abs_ph() {
        let a = _mm512_set_ph(
            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
            27.0, -28.0, 29.0, -30.0,
        );
        let r = _mm512_abs_ph(a);
        let e = _mm512_set_ph(
            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
            29.0, 30.0,
        );
        assert_eq_m512h(r, e);
    }

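    // Complex conjugation negates the imaginary half of each pair:
    // conj(0.0 + 1.0i) = 0.0 - 1.0i.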
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let r = _mm_conj_pch(a);
        let e = _mm_set1_pch(0.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
        let r = _mm_mask_conj_pch(src, 0b0101, a);
        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_conj_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let r = _mm_maskz_conj_pch(0b0101, a);
        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_conj_pch(a);
        let e = _mm256_set1_pch(0.0, -1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let src = _mm256_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
        );
        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
        let e = _mm256_setr_ph(
            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_conj_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let r = _mm256_maskz_conj_pch(0b01010101, a);
        let e = _mm256_setr_ph(
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_conj_pch(a);
        let e = _mm512_set1_pch(0.0, -1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let src = _mm512_setr_ph(
            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
            32.0, 33.0,
        );
        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
        let e = _mm512_setr_ph(
            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
            33.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_conj_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
        let e = _mm512_setr_ph(
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

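    // Complex fused multiply-add computes a * b + c per complex pair:
    // (0.0 + 1.0i) * (0.0 + 2.0i) + (0.0 + 3.0i) = -2.0 + 3.0i.
    // For unselected pairs, the `mask` variant keeps `a`, `mask3` keeps `c`, and
    // `maskz` zeroes them.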
19227    #[simd_test(enable = "avx512fp16,avx512vl")]
19228    unsafe fn test_mm_fmadd_pch() {
19229        let a = _mm_set1_pch(0.0, 1.0);
19230        let b = _mm_set1_pch(0.0, 2.0);
19231        let c = _mm_set1_pch(0.0, 3.0);
19232        let r = _mm_fmadd_pch(a, b, c);
19233        let e = _mm_set1_pch(-2.0, 3.0);
19234        assert_eq_m128h(r, e);
19235    }
19236
19237    #[simd_test(enable = "avx512fp16,avx512vl")]
19238    unsafe fn test_mm_mask_fmadd_pch() {
19239        let a = _mm_set1_pch(0.0, 1.0);
19240        let b = _mm_set1_pch(0.0, 2.0);
19241        let c = _mm_set1_pch(0.0, 3.0);
19242        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
19243        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
19244        assert_eq_m128h(r, e);
19245    }
19246
19247    #[simd_test(enable = "avx512fp16,avx512vl")]
19248    unsafe fn test_mm_mask3_fmadd_pch() {
19249        let a = _mm_set1_pch(0.0, 1.0);
19250        let b = _mm_set1_pch(0.0, 2.0);
19251        let c = _mm_set1_pch(0.0, 3.0);
19252        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
19253        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
19254        assert_eq_m128h(r, e);
19255    }
19256
19257    #[simd_test(enable = "avx512fp16,avx512vl")]
19258    unsafe fn test_mm_maskz_fmadd_pch() {
19259        let a = _mm_set1_pch(0.0, 1.0);
19260        let b = _mm_set1_pch(0.0, 2.0);
19261        let c = _mm_set1_pch(0.0, 3.0);
19262        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
19263        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
19264        assert_eq_m128h(r, e);
19265    }
19266
19267    #[simd_test(enable = "avx512fp16,avx512vl")]
19268    unsafe fn test_mm256_fmadd_pch() {
19269        let a = _mm256_set1_pch(0.0, 1.0);
19270        let b = _mm256_set1_pch(0.0, 2.0);
19271        let c = _mm256_set1_pch(0.0, 3.0);
19272        let r = _mm256_fmadd_pch(a, b, c);
19273        let e = _mm256_set1_pch(-2.0, 3.0);
19274        assert_eq_m256h(r, e);
19275    }
19276
19277    #[simd_test(enable = "avx512fp16,avx512vl")]
19278    unsafe fn test_mm256_mask_fmadd_pch() {
19279        let a = _mm256_set1_pch(0.0, 1.0);
19280        let b = _mm256_set1_pch(0.0, 2.0);
19281        let c = _mm256_set1_pch(0.0, 3.0);
19282        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
19283        let e = _mm256_setr_ph(
19284            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19285        );
19286        assert_eq_m256h(r, e);
19287    }
19288
19289    #[simd_test(enable = "avx512fp16,avx512vl")]
19290    unsafe fn test_mm256_mask3_fmadd_pch() {
19291        let a = _mm256_set1_pch(0.0, 1.0);
19292        let b = _mm256_set1_pch(0.0, 2.0);
19293        let c = _mm256_set1_pch(0.0, 3.0);
19294        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
19295        let e = _mm256_setr_ph(
19296            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19297        );
19298        assert_eq_m256h(r, e);
19299    }
19300
19301    #[simd_test(enable = "avx512fp16,avx512vl")]
19302    unsafe fn test_mm256_maskz_fmadd_pch() {
19303        let a = _mm256_set1_pch(0.0, 1.0);
19304        let b = _mm256_set1_pch(0.0, 2.0);
19305        let c = _mm256_set1_pch(0.0, 3.0);
19306        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
19307        let e = _mm256_setr_ph(
19308            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19309        );
19310        assert_eq_m256h(r, e);
19311    }
19312
19313    #[simd_test(enable = "avx512fp16")]
19314    unsafe fn test_mm512_fmadd_pch() {
19315        let a = _mm512_set1_pch(0.0, 1.0);
19316        let b = _mm512_set1_pch(0.0, 2.0);
19317        let c = _mm512_set1_pch(0.0, 3.0);
19318        let r = _mm512_fmadd_pch(a, b, c);
19319        let e = _mm512_set1_pch(-2.0, 3.0);
19320        assert_eq_m512h(r, e);
19321    }
19322
19323    #[simd_test(enable = "avx512fp16")]
19324    unsafe fn test_mm512_mask_fmadd_pch() {
19325        let a = _mm512_set1_pch(0.0, 1.0);
19326        let b = _mm512_set1_pch(0.0, 2.0);
19327        let c = _mm512_set1_pch(0.0, 3.0);
19328        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
19329        let e = _mm512_setr_ph(
19330            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19331            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19332        );
19333        assert_eq_m512h(r, e);
19334    }
19335
19336    #[simd_test(enable = "avx512fp16")]
19337    unsafe fn test_mm512_mask3_fmadd_pch() {
19338        let a = _mm512_set1_pch(0.0, 1.0);
19339        let b = _mm512_set1_pch(0.0, 2.0);
19340        let c = _mm512_set1_pch(0.0, 3.0);
19341        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
19342        let e = _mm512_setr_ph(
19343            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19344            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19345        );
19346        assert_eq_m512h(r, e);
19347    }
19348
19349    #[simd_test(enable = "avx512fp16")]
19350    unsafe fn test_mm512_maskz_fmadd_pch() {
19351        let a = _mm512_set1_pch(0.0, 1.0);
19352        let b = _mm512_set1_pch(0.0, 2.0);
19353        let c = _mm512_set1_pch(0.0, 3.0);
19354        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
19355        let e = _mm512_setr_ph(
19356            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19357            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19358        );
19359        assert_eq_m512h(r, e);
19360    }
19361
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r =
            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_pch(-2.0, 3.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b0101010101010101,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b0101010101010101,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

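    // Scalar complex (sch) forms compute only the lowest complex pair; result
    // elements 2..7 are copied from a (from c in the mask3 forms). The distinct
    // filler values in a, b and c make any wrong pass-through source visible.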
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fmadd_sch(a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fmadd_sch(a, 0, b, c);
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_sch(a, 1, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fmadd_sch(0, a, b, c);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_sch(1, a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

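    // fcmadd conjugates the first operand before multiplying:
    // conj(a) * b + c = (-i)(2i) + 3i = 2 + 3i, so the real part flips sign
    // relative to the fmadd_pch expectations.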
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_fcmadd_pch(a, b, c);
        let e = _mm_set1_pch(2.0, 3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fcmadd_pch() {
        let a = _mm_set1_pch(0.0, 1.0);
        let b = _mm_set1_pch(0.0, 2.0);
        let c = _mm_set1_pch(0.0, 3.0);
        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_fcmadd_pch(a, b, c);
        let e = _mm256_set1_pch(2.0, 3.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fcmadd_pch() {
        let a = _mm256_set1_pch(0.0, 1.0);
        let b = _mm256_set1_pch(0.0, 2.0);
        let c = _mm256_set1_pch(0.0, 3.0);
        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
        let e = _mm256_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_fcmadd_pch(a, b, c);
        let e = _mm512_set1_pch(2.0, 3.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmadd_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

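    // Rounded variants of the conjugate complex FMA; as with fmadd, SAE plus
    // round-to-nearest leaves the expected values unchanged.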
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r =
            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_pch(2.0, 3.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b0101010101010101,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b0101010101010101,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fcmadd_round_pch() {
        let a = _mm512_set1_pch(0.0, 1.0);
        let b = _mm512_set1_pch(0.0, 2.0);
        let c = _mm512_set1_pch(0.0, 3.0);
        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_setr_ph(
            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
        );
        assert_eq_m512h(r, e);
    }

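    // Scalar conjugate FMA: only the lowest complex pair is conjugate-multiplied
    // and accumulated; the pass-through lanes follow the same rules as fmadd_sch.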
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_sch(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fcmadd_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fcmadd_round_sch() {
        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        assert_eq_m128h(r, e);
    }

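    // The real packed-f16 (ph) FMA tests use a = 1.0, b = 2.0, c = 3.0 in every
    // lane, giving a * b + c = 5.0. Here each mask bit controls a single f16
    // lane, and expected vectors use _mm_set_ph-style order (highest lane first).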
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmadd_ph(a, b, c);
        let e = _mm_set1_ph(5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmadd_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmadd_ph(a, b, c);
        let e = _mm256_set1_ph(5.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fmadd_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmadd_ph(a, b, c);
        let e = _mm512_set1_ph(5.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmadd_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(5.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmadd_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
        );
        assert_eq_m512h(r, e);
    }

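    // Scalar (sh) forms compute only element 0, copying elements 1..7 from a
    // (from c in the mask3 forms); the 10.0..36.0 fillers pin down which source
    // those upper lanes came from.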
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_sh(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmadd_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_sh(a, 1, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmadd_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmadd_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_sh(1, a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmadd_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

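    // fmsub negates the addend: a * b - c = 1.0 * 2.0 - 3.0 = -1.0 per lane.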
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_fmsub_ph(a, b, c);
        let e = _mm_set1_ph(-1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask3_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_fmsub_ph() {
        let a = _mm_set1_ph(1.0);
        let b = _mm_set1_ph(2.0);
        let c = _mm_set1_ph(3.0);
        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_fmsub_ph(a, b, c);
        let e = _mm256_set1_ph(-1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
        let e = _mm256_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask3_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
        let e = _mm256_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_fmsub_ph() {
        let a = _mm256_set1_ph(1.0);
        let b = _mm256_set1_ph(2.0);
        let c = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
        let e = _mm256_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmsub_ph(a, b, c);
        let e = _mm512_set1_ph(-1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
        let e = _mm512_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
        let e = _mm512_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmsub_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
        let e = _mm512_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm512_set1_ph(-1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            0b01010101010101010101010101010101,
            b,
            c,
        );
        let e = _mm512_set_ph(
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask3_fmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a,
            b,
            c,
            0b01010101010101010101010101010101,
        );
        let e = _mm512_set_ph(
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_fmsub_round_ph() {
        let a = _mm512_set1_ph(1.0);
        let b = _mm512_set1_ph(2.0);
        let c = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
            c,
        );
        let e = _mm512_set_ph(
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
        );
        assert_eq_m512h(r, e);
    }

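    // Scalar fmsub: lane 0 becomes 1.0 * 2.0 - 3.0 = -1.0; all other lanes pass
    // through as in the scalar fmadd tests.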
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmsub_sh(a, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmsub_sh(a, 0, b, c);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmsub_sh(a, 1, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmsub_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmsub_sh(0, a, b, c);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmsub_sh(1, a, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 0, b, c,
        );
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, 1, b, c,
        );
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask3_fmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 0,
        );
        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            a, b, c, 1,
        );
        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_fmsub_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0, a, b, c,
        );
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            1, a, b, c,
        );
        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

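    // fnmadd negates the product: -(a * b) + c = -2.0 + 3.0 = 1.0. Note that
    // this equals a's value of 1.0, so in the mask_ tests the selected and
    // unselected lanes coincide; the mask3 and maskz forms (merging with
    // c = 3.0 or with zero) still make the masking observable.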
20465    #[simd_test(enable = "avx512fp16,avx512vl")]
20466    unsafe fn test_mm_fnmadd_ph() {
20467        let a = _mm_set1_ph(1.0);
20468        let b = _mm_set1_ph(2.0);
20469        let c = _mm_set1_ph(3.0);
20470        let r = _mm_fnmadd_ph(a, b, c);
20471        let e = _mm_set1_ph(1.0);
20472        assert_eq_m128h(r, e);
20473    }
20474
20475    #[simd_test(enable = "avx512fp16,avx512vl")]
20476    unsafe fn test_mm_mask_fnmadd_ph() {
20477        let a = _mm_set1_ph(1.0);
20478        let b = _mm_set1_ph(2.0);
20479        let c = _mm_set1_ph(3.0);
20480        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
20481        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
20482        assert_eq_m128h(r, e);
20483    }
20484
20485    #[simd_test(enable = "avx512fp16,avx512vl")]
20486    unsafe fn test_mm_mask3_fnmadd_ph() {
20487        let a = _mm_set1_ph(1.0);
20488        let b = _mm_set1_ph(2.0);
20489        let c = _mm_set1_ph(3.0);
20490        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
20491        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
20492        assert_eq_m128h(r, e);
20493    }
20494
20495    #[simd_test(enable = "avx512fp16,avx512vl")]
20496    unsafe fn test_mm_maskz_fnmadd_ph() {
20497        let a = _mm_set1_ph(1.0);
20498        let b = _mm_set1_ph(2.0);
20499        let c = _mm_set1_ph(3.0);
20500        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
20501        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
20502        assert_eq_m128h(r, e);
20503    }
20504
20505    #[simd_test(enable = "avx512fp16,avx512vl")]
20506    unsafe fn test_mm256_fnmadd_ph() {
20507        let a = _mm256_set1_ph(1.0);
20508        let b = _mm256_set1_ph(2.0);
20509        let c = _mm256_set1_ph(3.0);
20510        let r = _mm256_fnmadd_ph(a, b, c);
20511        let e = _mm256_set1_ph(1.0);
20512        assert_eq_m256h(r, e);
20513    }
20514
20515    #[simd_test(enable = "avx512fp16,avx512vl")]
20516    unsafe fn test_mm256_mask_fnmadd_ph() {
20517        let a = _mm256_set1_ph(1.0);
20518        let b = _mm256_set1_ph(2.0);
20519        let c = _mm256_set1_ph(3.0);
20520        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
20521        let e = _mm256_set_ph(
20522            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20523        );
20524        assert_eq_m256h(r, e);
20525    }
20526
20527    #[simd_test(enable = "avx512fp16,avx512vl")]
20528    unsafe fn test_mm256_mask3_fnmadd_ph() {
20529        let a = _mm256_set1_ph(1.0);
20530        let b = _mm256_set1_ph(2.0);
20531        let c = _mm256_set1_ph(3.0);
20532        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
20533        let e = _mm256_set_ph(
20534            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20535        );
20536        assert_eq_m256h(r, e);
20537    }
20538
20539    #[simd_test(enable = "avx512fp16,avx512vl")]
20540    unsafe fn test_mm256_maskz_fnmadd_ph() {
20541        let a = _mm256_set1_ph(1.0);
20542        let b = _mm256_set1_ph(2.0);
20543        let c = _mm256_set1_ph(3.0);
20544        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
20545        let e = _mm256_set_ph(
20546            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20547        );
20548        assert_eq_m256h(r, e);
20549    }
20550
20551    #[simd_test(enable = "avx512fp16")]
20552    unsafe fn test_mm512_fnmadd_ph() {
20553        let a = _mm512_set1_ph(1.0);
20554        let b = _mm512_set1_ph(2.0);
20555        let c = _mm512_set1_ph(3.0);
20556        let r = _mm512_fnmadd_ph(a, b, c);
20557        let e = _mm512_set1_ph(1.0);
20558        assert_eq_m512h(r, e);
20559    }
20560
20561    #[simd_test(enable = "avx512fp16")]
20562    unsafe fn test_mm512_mask_fnmadd_ph() {
20563        let a = _mm512_set1_ph(1.0);
20564        let b = _mm512_set1_ph(2.0);
20565        let c = _mm512_set1_ph(3.0);
20566        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20567        let e = _mm512_set_ph(
20568            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20569            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20570        );
20571        assert_eq_m512h(r, e);
20572    }
20573
20574    #[simd_test(enable = "avx512fp16")]
20575    unsafe fn test_mm512_mask3_fnmadd_ph() {
20576        let a = _mm512_set1_ph(1.0);
20577        let b = _mm512_set1_ph(2.0);
20578        let c = _mm512_set1_ph(3.0);
20579        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20580        let e = _mm512_set_ph(
20581            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20582            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20583        );
20584        assert_eq_m512h(r, e);
20585    }
20586
20587    #[simd_test(enable = "avx512fp16")]
20588    unsafe fn test_mm512_maskz_fnmadd_ph() {
20589        let a = _mm512_set1_ph(1.0);
20590        let b = _mm512_set1_ph(2.0);
20591        let c = _mm512_set1_ph(3.0);
20592        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
20593        let e = _mm512_set_ph(
20594            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20595            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20596        );
20597        assert_eq_m512h(r, e);
20598    }
20599
20600    #[simd_test(enable = "avx512fp16")]
20601    unsafe fn test_mm512_fnmadd_round_ph() {
20602        let a = _mm512_set1_ph(1.0);
20603        let b = _mm512_set1_ph(2.0);
20604        let c = _mm512_set1_ph(3.0);
20605        let r =
20606            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20607        let e = _mm512_set1_ph(1.0);
20608        assert_eq_m512h(r, e);
20609    }
20610
20611    #[simd_test(enable = "avx512fp16")]
20612    unsafe fn test_mm512_mask_fnmadd_round_ph() {
20613        let a = _mm512_set1_ph(1.0);
20614        let b = _mm512_set1_ph(2.0);
20615        let c = _mm512_set1_ph(3.0);
20616        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20617            a,
20618            0b01010101010101010101010101010101,
20619            b,
20620            c,
20621        );
20622        let e = _mm512_set_ph(
20623            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20624            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20625        );
20626        assert_eq_m512h(r, e);
20627    }
20628
20629    #[simd_test(enable = "avx512fp16")]
20630    unsafe fn test_mm512_mask3_fnmadd_round_ph() {
20631        let a = _mm512_set1_ph(1.0);
20632        let b = _mm512_set1_ph(2.0);
20633        let c = _mm512_set1_ph(3.0);
20634        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20635            a,
20636            b,
20637            c,
20638            0b01010101010101010101010101010101,
20639        );
20640        let e = _mm512_set_ph(
20641            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20642            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20643        );
20644        assert_eq_m512h(r, e);
20645    }
20646
20647    #[simd_test(enable = "avx512fp16")]
20648    unsafe fn test_mm512_maskz_fnmadd_round_ph() {
20649        let a = _mm512_set1_ph(1.0);
20650        let b = _mm512_set1_ph(2.0);
20651        let c = _mm512_set1_ph(3.0);
20652        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20653            0b01010101010101010101010101010101,
20654            a,
20655            b,
20656            c,
20657        );
20658        let e = _mm512_set_ph(
20659            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20660            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20661        );
20662        assert_eq_m512h(r, e);
20663    }
20664
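    // Scalar `_sh` variants only operate on element 0: -(a[0] * b[0]) + c[0] is
    // -(1.0 * 2.0) + 3.0 = 1.0, and elements 1..8 are passed through from `a`.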
20665    #[simd_test(enable = "avx512fp16")]
20666    unsafe fn test_mm_fnmadd_sh() {
20667        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20668        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20669        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20670        let r = _mm_fnmadd_sh(a, b, c);
20671        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20672        assert_eq_m128h(r, e);
20673    }
20674
20675    #[simd_test(enable = "avx512fp16")]
20676    unsafe fn test_mm_mask_fnmadd_sh() {
20677        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20678        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20679        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20680        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
20681        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20682        assert_eq_m128h(r, e);
20683        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
20684        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20685        assert_eq_m128h(r, e);
20686    }
20687
20688    #[simd_test(enable = "avx512fp16")]
20689    unsafe fn test_mm_mask3_fnmadd_sh() {
20690        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20691        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20692        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20693        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
20694        let e = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20695        assert_eq_m128h(r, e);
20696        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
20697        let e = _mm_setr_ph(1.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20698        assert_eq_m128h(r, e);
20699    }
20700
20701    #[simd_test(enable = "avx512fp16")]
20702    unsafe fn test_mm_maskz_fnmadd_sh() {
20703        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20704        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20705        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20706        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
20707        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20708        assert_eq_m128h(r, e);
20709        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
20710        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20711        assert_eq_m128h(r, e);
20712    }
20713
20714    #[simd_test(enable = "avx512fp16")]
20715    unsafe fn test_mm_fnmadd_round_sh() {
20716        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20717        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20718        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20719        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20720        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20721        assert_eq_m128h(r, e);
20722    }
20723
20724    #[simd_test(enable = "avx512fp16")]
20725    unsafe fn test_mm_mask_fnmadd_round_sh() {
20726        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20727        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20728        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20729        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20730            a, 0, b, c,
20731        );
20732        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20733        assert_eq_m128h(r, e);
20734        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20735            a, 1, b, c,
20736        );
20737        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20738        assert_eq_m128h(r, e);
20739    }
20740
20741    #[simd_test(enable = "avx512fp16")]
20742    unsafe fn test_mm_mask3_fnmadd_round_sh() {
20743        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20744        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20745        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20746        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20747            a, b, c, 0,
20748        );
20749        let e = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20750        assert_eq_m128h(r, e);
20751        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20752            a, b, c, 1,
20753        );
20754        let e = _mm_setr_ph(1.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20755        assert_eq_m128h(r, e);
20756    }
20757
20758    #[simd_test(enable = "avx512fp16")]
20759    unsafe fn test_mm_maskz_fnmadd_round_sh() {
20760        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20761        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20762        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20763        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20764            0, a, b, c,
20765        );
20766        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20767        assert_eq_m128h(r, e);
20768        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20769            1, a, b, c,
20770        );
20771        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20772        assert_eq_m128h(r, e);
20773    }
20774
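    // FNMSUB computes -(a * b) - c per element: -(1.0 * 2.0) - 3.0 = -5.0.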
20775    #[simd_test(enable = "avx512fp16,avx512vl")]
20776    unsafe fn test_mm_fnmsub_ph() {
20777        let a = _mm_set1_ph(1.0);
20778        let b = _mm_set1_ph(2.0);
20779        let c = _mm_set1_ph(3.0);
20780        let r = _mm_fnmsub_ph(a, b, c);
20781        let e = _mm_set1_ph(-5.0);
20782        assert_eq_m128h(r, e);
20783    }
20784
20785    #[simd_test(enable = "avx512fp16,avx512vl")]
20786    unsafe fn test_mm_mask_fnmsub_ph() {
20787        let a = _mm_set1_ph(1.0);
20788        let b = _mm_set1_ph(2.0);
20789        let c = _mm_set1_ph(3.0);
20790        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
20791        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
20792        assert_eq_m128h(r, e);
20793    }
20794
20795    #[simd_test(enable = "avx512fp16,avx512vl")]
20796    unsafe fn test_mm_mask3_fnmsub_ph() {
20797        let a = _mm_set1_ph(1.0);
20798        let b = _mm_set1_ph(2.0);
20799        let c = _mm_set1_ph(3.0);
20800        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
20801        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
20802        assert_eq_m128h(r, e);
20803    }
20804
20805    #[simd_test(enable = "avx512fp16,avx512vl")]
20806    unsafe fn test_mm_maskz_fnmsub_ph() {
20807        let a = _mm_set1_ph(1.0);
20808        let b = _mm_set1_ph(2.0);
20809        let c = _mm_set1_ph(3.0);
20810        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
20811        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
20812        assert_eq_m128h(r, e);
20813    }
20814
20815    #[simd_test(enable = "avx512fp16,avx512vl")]
20816    unsafe fn test_mm256_fnmsub_ph() {
20817        let a = _mm256_set1_ph(1.0);
20818        let b = _mm256_set1_ph(2.0);
20819        let c = _mm256_set1_ph(3.0);
20820        let r = _mm256_fnmsub_ph(a, b, c);
20821        let e = _mm256_set1_ph(-5.0);
20822        assert_eq_m256h(r, e);
20823    }
20824
20825    #[simd_test(enable = "avx512fp16,avx512vl")]
20826    unsafe fn test_mm256_mask_fnmsub_ph() {
20827        let a = _mm256_set1_ph(1.0);
20828        let b = _mm256_set1_ph(2.0);
20829        let c = _mm256_set1_ph(3.0);
20830        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
20831        let e = _mm256_set_ph(
20832            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
20833        );
20834        assert_eq_m256h(r, e);
20835    }
20836
20837    #[simd_test(enable = "avx512fp16,avx512vl")]
20838    unsafe fn test_mm256_mask3_fnmsub_ph() {
20839        let a = _mm256_set1_ph(1.0);
20840        let b = _mm256_set1_ph(2.0);
20841        let c = _mm256_set1_ph(3.0);
20842        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
20843        let e = _mm256_set_ph(
20844            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
20845        );
20846        assert_eq_m256h(r, e);
20847    }
20848
20849    #[simd_test(enable = "avx512fp16,avx512vl")]
20850    unsafe fn test_mm256_maskz_fnmsub_ph() {
20851        let a = _mm256_set1_ph(1.0);
20852        let b = _mm256_set1_ph(2.0);
20853        let c = _mm256_set1_ph(3.0);
20854        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
20855        let e = _mm256_set_ph(
20856            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
20857        );
20858        assert_eq_m256h(r, e);
20859    }
20860
20861    #[simd_test(enable = "avx512fp16")]
20862    unsafe fn test_mm512_fnmsub_ph() {
20863        let a = _mm512_set1_ph(1.0);
20864        let b = _mm512_set1_ph(2.0);
20865        let c = _mm512_set1_ph(3.0);
20866        let r = _mm512_fnmsub_ph(a, b, c);
20867        let e = _mm512_set1_ph(-5.0);
20868        assert_eq_m512h(r, e);
20869    }
20870
20871    #[simd_test(enable = "avx512fp16")]
20872    unsafe fn test_mm512_mask_fnmsub_ph() {
20873        let a = _mm512_set1_ph(1.0);
20874        let b = _mm512_set1_ph(2.0);
20875        let c = _mm512_set1_ph(3.0);
20876        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
20877        let e = _mm512_set_ph(
20878            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
20879            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
20880        );
20881        assert_eq_m512h(r, e);
20882    }
20883
20884    #[simd_test(enable = "avx512fp16")]
20885    unsafe fn test_mm512_mask3_fnmsub_ph() {
20886        let a = _mm512_set1_ph(1.0);
20887        let b = _mm512_set1_ph(2.0);
20888        let c = _mm512_set1_ph(3.0);
20889        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
20890        let e = _mm512_set_ph(
20891            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
20892            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
20893        );
20894        assert_eq_m512h(r, e);
20895    }
20896
20897    #[simd_test(enable = "avx512fp16")]
20898    unsafe fn test_mm512_maskz_fnmsub_ph() {
20899        let a = _mm512_set1_ph(1.0);
20900        let b = _mm512_set1_ph(2.0);
20901        let c = _mm512_set1_ph(3.0);
20902        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
20903        let e = _mm512_set_ph(
20904            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
20905            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
20906        );
20907        assert_eq_m512h(r, e);
20908    }
20909
20910    #[simd_test(enable = "avx512fp16")]
20911    unsafe fn test_mm512_fnmsub_round_ph() {
20912        let a = _mm512_set1_ph(1.0);
20913        let b = _mm512_set1_ph(2.0);
20914        let c = _mm512_set1_ph(3.0);
20915        let r =
20916            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20917        let e = _mm512_set1_ph(-5.0);
20918        assert_eq_m512h(r, e);
20919    }
20920
20921    #[simd_test(enable = "avx512fp16")]
20922    unsafe fn test_mm512_mask_fnmsub_round_ph() {
20923        let a = _mm512_set1_ph(1.0);
20924        let b = _mm512_set1_ph(2.0);
20925        let c = _mm512_set1_ph(3.0);
20926        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20927            a,
20928            0b01010101010101010101010101010101,
20929            b,
20930            c,
20931        );
20932        let e = _mm512_set_ph(
20933            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
20934            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
20935        );
20936        assert_eq_m512h(r, e);
20937    }
20938
20939    #[simd_test(enable = "avx512fp16")]
20940    unsafe fn test_mm512_mask3_fnmsub_round_ph() {
20941        let a = _mm512_set1_ph(1.0);
20942        let b = _mm512_set1_ph(2.0);
20943        let c = _mm512_set1_ph(3.0);
20944        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20945            a,
20946            b,
20947            c,
20948            0b01010101010101010101010101010101,
20949        );
20950        let e = _mm512_set_ph(
20951            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
20952            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
20953        );
20954        assert_eq_m512h(r, e);
20955    }
20956
20957    #[simd_test(enable = "avx512fp16")]
20958    unsafe fn test_mm512_maskz_fnmsub_round_ph() {
20959        let a = _mm512_set1_ph(1.0);
20960        let b = _mm512_set1_ph(2.0);
20961        let c = _mm512_set1_ph(3.0);
20962        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20963            0b01010101010101010101010101010101,
20964            a,
20965            b,
20966            c,
20967        );
20968        let e = _mm512_set_ph(
20969            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
20970            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
20971        );
20972        assert_eq_m512h(r, e);
20973    }
20974
20975    #[simd_test(enable = "avx512fp16")]
20976    unsafe fn test_mm_fnmsub_sh() {
20977        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20978        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20979        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20980        let r = _mm_fnmsub_sh(a, b, c);
20981        let e = _mm_setr_ph(-5.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20982        assert_eq_m128h(r, e);
20983    }
20984
20985    #[simd_test(enable = "avx512fp16")]
20986    unsafe fn test_mm_mask_fnmsub_sh() {
20987        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20988        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
20989        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
20990        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
20991        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20992        assert_eq_m128h(r, e);
20993        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
20994        let e = _mm_setr_ph(-5.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
20995        assert_eq_m128h(r, e);
20996    }
20997
20998    #[simd_test(enable = "avx512fp16")]
20999    unsafe fn test_mm_mask3_fnmsub_sh() {
21000        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21001        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21002        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21003        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
21004        let e = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21005        assert_eq_m128h(r, e);
21006        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
21007        let e = _mm_setr_ph(-5.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21008        assert_eq_m128h(r, e);
21009    }
21010
21011    #[simd_test(enable = "avx512fp16")]
21012    unsafe fn test_mm_maskz_fnmsub_sh() {
21013        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21014        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21015        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21016        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
21017        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21018        assert_eq_m128h(r, e);
21019        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
21020        let e = _mm_setr_ph(-5.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21021        assert_eq_m128h(r, e);
21022    }
21023
21024    #[simd_test(enable = "avx512fp16")]
21025    unsafe fn test_mm_fnmsub_round_sh() {
21026        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21027        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21028        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21029        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21030        let e = _mm_setr_ph(-5.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21031        assert_eq_m128h(r, e);
21032    }
21033
21034    #[simd_test(enable = "avx512fp16")]
21035    unsafe fn test_mm_mask_fnmsub_round_sh() {
21036        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21037        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21038        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21039        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21040            a, 0, b, c,
21041        );
21042        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21043        assert_eq_m128h(r, e);
21044        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21045            a, 1, b, c,
21046        );
21047        let e = _mm_setr_ph(-5.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21048        assert_eq_m128h(r, e);
21049    }
21050
21051    #[simd_test(enable = "avx512fp16")]
21052    unsafe fn test_mm_mask3_fnmsub_round_sh() {
21053        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21054        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21055        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21056        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21057            a, b, c, 0,
21058        );
21059        let e = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21060        assert_eq_m128h(r, e);
21061        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21062            a, b, c, 1,
21063        );
21064        let e = _mm_setr_ph(-5.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21065        assert_eq_m128h(r, e);
21066    }
21067
21068    #[simd_test(enable = "avx512fp16")]
21069    unsafe fn test_mm_maskz_fnmsub_round_sh() {
21070        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21071        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21072        let c = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21073        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21074            0, a, b, c,
21075        );
21076        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21077        assert_eq_m128h(r, e);
21078        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21079            1, a, b, c,
21080        );
21081        let e = _mm_setr_ph(-5.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21082        assert_eq_m128h(r, e);
21083    }
21084
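    // FMADDSUB alternates per element: even-indexed elements compute a * b - c
    // (-1.0) and odd-indexed elements compute a * b + c (5.0). `_mm_set_ph` lists
    // elements from highest index to lowest, so the expected vector reads 5.0,
    // -1.0, ... from left to right.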
21085    #[simd_test(enable = "avx512fp16,avx512vl")]
21086    unsafe fn test_mm_fmaddsub_ph() {
21087        let a = _mm_set1_ph(1.0);
21088        let b = _mm_set1_ph(2.0);
21089        let c = _mm_set1_ph(3.0);
21090        let r = _mm_fmaddsub_ph(a, b, c);
21091        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
21092        assert_eq_m128h(r, e);
21093    }
21094
21095    #[simd_test(enable = "avx512fp16,avx512vl")]
21096    unsafe fn test_mm_mask_fmaddsub_ph() {
21097        let a = _mm_set1_ph(1.0);
21098        let b = _mm_set1_ph(2.0);
21099        let c = _mm_set1_ph(3.0);
21100        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
21101        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
21102        assert_eq_m128h(r, e);
21103    }
21104
21105    #[simd_test(enable = "avx512fp16,avx512vl")]
21106    unsafe fn test_mm_mask3_fmaddsub_ph() {
21107        let a = _mm_set1_ph(1.0);
21108        let b = _mm_set1_ph(2.0);
21109        let c = _mm_set1_ph(3.0);
21110        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
21111        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
21112        assert_eq_m128h(r, e);
21113    }
21114
21115    #[simd_test(enable = "avx512fp16,avx512vl")]
21116    unsafe fn test_mm_maskz_fmaddsub_ph() {
21117        let a = _mm_set1_ph(1.0);
21118        let b = _mm_set1_ph(2.0);
21119        let c = _mm_set1_ph(3.0);
21120        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
21121        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
21122        assert_eq_m128h(r, e);
21123    }
21124
21125    #[simd_test(enable = "avx512fp16,avx512vl")]
21126    unsafe fn test_mm256_fmaddsub_ph() {
21127        let a = _mm256_set1_ph(1.0);
21128        let b = _mm256_set1_ph(2.0);
21129        let c = _mm256_set1_ph(3.0);
21130        let r = _mm256_fmaddsub_ph(a, b, c);
21131        let e = _mm256_set_ph(
21132            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21133        );
21134        assert_eq_m256h(r, e);
21135    }
21136
21137    #[simd_test(enable = "avx512fp16,avx512vl")]
21138    unsafe fn test_mm256_mask_fmaddsub_ph() {
21139        let a = _mm256_set1_ph(1.0);
21140        let b = _mm256_set1_ph(2.0);
21141        let c = _mm256_set1_ph(3.0);
21142        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
21143        let e = _mm256_set_ph(
21144            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21145        );
21146        assert_eq_m256h(r, e);
21147    }
21148
21149    #[simd_test(enable = "avx512fp16,avx512vl")]
21150    unsafe fn test_mm256_mask3_fmaddsub_ph() {
21151        let a = _mm256_set1_ph(1.0);
21152        let b = _mm256_set1_ph(2.0);
21153        let c = _mm256_set1_ph(3.0);
21154        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
21155        let e = _mm256_set_ph(
21156            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21157        );
21158        assert_eq_m256h(r, e);
21159    }
21160
21161    #[simd_test(enable = "avx512fp16,avx512vl")]
21162    unsafe fn test_mm256_maskz_fmaddsub_ph() {
21163        let a = _mm256_set1_ph(1.0);
21164        let b = _mm256_set1_ph(2.0);
21165        let c = _mm256_set1_ph(3.0);
21166        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
21167        let e = _mm256_set_ph(
21168            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21169        );
21170        assert_eq_m256h(r, e);
21171    }
21172
21173    #[simd_test(enable = "avx512fp16")]
21174    unsafe fn test_mm512_fmaddsub_ph() {
21175        let a = _mm512_set1_ph(1.0);
21176        let b = _mm512_set1_ph(2.0);
21177        let c = _mm512_set1_ph(3.0);
21178        let r = _mm512_fmaddsub_ph(a, b, c);
21179        let e = _mm512_set_ph(
21180            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21181            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21182        );
21183        assert_eq_m512h(r, e);
21184    }
21185
21186    #[simd_test(enable = "avx512fp16")]
21187    unsafe fn test_mm512_mask_fmaddsub_ph() {
21188        let a = _mm512_set1_ph(1.0);
21189        let b = _mm512_set1_ph(2.0);
21190        let c = _mm512_set1_ph(3.0);
21191        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
21192        let e = _mm512_set_ph(
21193            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21194            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21195        );
21196        assert_eq_m512h(r, e);
21197    }
21198
21199    #[simd_test(enable = "avx512fp16")]
21200    unsafe fn test_mm512_mask3_fmaddsub_ph() {
21201        let a = _mm512_set1_ph(1.0);
21202        let b = _mm512_set1_ph(2.0);
21203        let c = _mm512_set1_ph(3.0);
21204        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
21205        let e = _mm512_set_ph(
21206            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21207            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21208        );
21209        assert_eq_m512h(r, e);
21210    }
21211
21212    #[simd_test(enable = "avx512fp16")]
21213    unsafe fn test_mm512_maskz_fmaddsub_ph() {
21214        let a = _mm512_set1_ph(1.0);
21215        let b = _mm512_set1_ph(2.0);
21216        let c = _mm512_set1_ph(3.0);
21217        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
21218        let e = _mm512_set_ph(
21219            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21220            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21221        );
21222        assert_eq_m512h(r, e);
21223    }
21224
21225    #[simd_test(enable = "avx512fp16")]
21226    unsafe fn test_mm512_fmaddsub_round_ph() {
21227        let a = _mm512_set1_ph(1.0);
21228        let b = _mm512_set1_ph(2.0);
21229        let c = _mm512_set1_ph(3.0);
21230        let r =
21231            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21232        let e = _mm512_set_ph(
21233            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21234            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21235        );
21236        assert_eq_m512h(r, e);
21237    }
21238
21239    #[simd_test(enable = "avx512fp16")]
21240    unsafe fn test_mm512_mask_fmaddsub_round_ph() {
21241        let a = _mm512_set1_ph(1.0);
21242        let b = _mm512_set1_ph(2.0);
21243        let c = _mm512_set1_ph(3.0);
21244        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21245            a,
21246            0b00110011001100110011001100110011,
21247            b,
21248            c,
21249        );
21250        let e = _mm512_set_ph(
21251            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21252            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21253        );
21254        assert_eq_m512h(r, e);
21255    }
21256
21257    #[simd_test(enable = "avx512fp16")]
21258    unsafe fn test_mm512_mask3_fmaddsub_round_ph() {
21259        let a = _mm512_set1_ph(1.0);
21260        let b = _mm512_set1_ph(2.0);
21261        let c = _mm512_set1_ph(3.0);
21262        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21263            a,
21264            b,
21265            c,
21266            0b00110011001100110011001100110011,
21267        );
21268        let e = _mm512_set_ph(
21269            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21270            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21271        );
21272        assert_eq_m512h(r, e);
21273    }
21274
21275    #[simd_test(enable = "avx512fp16")]
21276    unsafe fn test_mm512_maskz_fmaddsub_round_ph() {
21277        let a = _mm512_set1_ph(1.0);
21278        let b = _mm512_set1_ph(2.0);
21279        let c = _mm512_set1_ph(3.0);
21280        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21281            0b00110011001100110011001100110011,
21282            a,
21283            b,
21284            c,
21285        );
21286        let e = _mm512_set_ph(
21287            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21288            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21289        );
21290        assert_eq_m512h(r, e);
21291    }
21292
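    // FMSUBADD mirrors FMADDSUB: even-indexed elements compute a * b + c (5.0)
    // and odd-indexed elements compute a * b - c (-1.0).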
21293    #[simd_test(enable = "avx512fp16,avx512vl")]
21294    unsafe fn test_mm_fmsubadd_ph() {
21295        let a = _mm_set1_ph(1.0);
21296        let b = _mm_set1_ph(2.0);
21297        let c = _mm_set1_ph(3.0);
21298        let r = _mm_fmsubadd_ph(a, b, c);
21299        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
21300        assert_eq_m128h(r, e);
21301    }
21302
21303    #[simd_test(enable = "avx512fp16,avx512vl")]
21304    unsafe fn test_mm_mask_fmsubadd_ph() {
21305        let a = _mm_set1_ph(1.0);
21306        let b = _mm_set1_ph(2.0);
21307        let c = _mm_set1_ph(3.0);
21308        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
21309        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
21310        assert_eq_m128h(r, e);
21311    }
21312
21313    #[simd_test(enable = "avx512fp16,avx512vl")]
21314    unsafe fn test_mm_mask3_fmsubadd_ph() {
21315        let a = _mm_set1_ph(1.0);
21316        let b = _mm_set1_ph(2.0);
21317        let c = _mm_set1_ph(3.0);
21318        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
21319        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
21320        assert_eq_m128h(r, e);
21321    }
21322
21323    #[simd_test(enable = "avx512fp16,avx512vl")]
21324    unsafe fn test_mm_maskz_fmsubadd_ph() {
21325        let a = _mm_set1_ph(1.0);
21326        let b = _mm_set1_ph(2.0);
21327        let c = _mm_set1_ph(3.0);
21328        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
21329        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
21330        assert_eq_m128h(r, e);
21331    }
21332
21333    #[simd_test(enable = "avx512fp16,avx512vl")]
21334    unsafe fn test_mm256_fmsubadd_ph() {
21335        let a = _mm256_set1_ph(1.0);
21336        let b = _mm256_set1_ph(2.0);
21337        let c = _mm256_set1_ph(3.0);
21338        let r = _mm256_fmsubadd_ph(a, b, c);
21339        let e = _mm256_set_ph(
21340            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21341        );
21342        assert_eq_m256h(r, e);
21343    }
21344
21345    #[simd_test(enable = "avx512fp16,avx512vl")]
21346    unsafe fn test_mm256_mask_fmsubadd_ph() {
21347        let a = _mm256_set1_ph(1.0);
21348        let b = _mm256_set1_ph(2.0);
21349        let c = _mm256_set1_ph(3.0);
21350        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
21351        let e = _mm256_set_ph(
21352            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21353        );
21354        assert_eq_m256h(r, e);
21355    }
21356
21357    #[simd_test(enable = "avx512fp16,avx512vl")]
21358    unsafe fn test_mm256_mask3_fmsubadd_ph() {
21359        let a = _mm256_set1_ph(1.0);
21360        let b = _mm256_set1_ph(2.0);
21361        let c = _mm256_set1_ph(3.0);
21362        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
21363        let e = _mm256_set_ph(
21364            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21365        );
21366        assert_eq_m256h(r, e);
21367    }
21368
21369    #[simd_test(enable = "avx512fp16,avx512vl")]
21370    unsafe fn test_mm256_maskz_fmsubadd_ph() {
21371        let a = _mm256_set1_ph(1.0);
21372        let b = _mm256_set1_ph(2.0);
21373        let c = _mm256_set1_ph(3.0);
21374        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
21375        let e = _mm256_set_ph(
21376            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21377        );
21378        assert_eq_m256h(r, e);
21379    }
21380
21381    #[simd_test(enable = "avx512fp16")]
21382    unsafe fn test_mm512_fmsubadd_ph() {
21383        let a = _mm512_set1_ph(1.0);
21384        let b = _mm512_set1_ph(2.0);
21385        let c = _mm512_set1_ph(3.0);
21386        let r = _mm512_fmsubadd_ph(a, b, c);
21387        let e = _mm512_set_ph(
21388            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21389            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21390        );
21391        assert_eq_m512h(r, e);
21392    }
21393
21394    #[simd_test(enable = "avx512fp16")]
21395    unsafe fn test_mm512_mask_fmsubadd_ph() {
21396        let a = _mm512_set1_ph(1.0);
21397        let b = _mm512_set1_ph(2.0);
21398        let c = _mm512_set1_ph(3.0);
21399        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
21400        let e = _mm512_set_ph(
21401            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21402            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21403        );
21404        assert_eq_m512h(r, e);
21405    }
21406
21407    #[simd_test(enable = "avx512fp16")]
21408    unsafe fn test_mm512_mask3_fmsubadd_ph() {
21409        let a = _mm512_set1_ph(1.0);
21410        let b = _mm512_set1_ph(2.0);
21411        let c = _mm512_set1_ph(3.0);
21412        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
21413        let e = _mm512_set_ph(
21414            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21415            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21416        );
21417        assert_eq_m512h(r, e);
21418    }
21419
21420    #[simd_test(enable = "avx512fp16")]
21421    unsafe fn test_mm512_maskz_fmsubadd_ph() {
21422        let a = _mm512_set1_ph(1.0);
21423        let b = _mm512_set1_ph(2.0);
21424        let c = _mm512_set1_ph(3.0);
21425        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
21426        let e = _mm512_set_ph(
21427            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21428            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21429        );
21430        assert_eq_m512h(r, e);
21431    }
21432
21433    #[simd_test(enable = "avx512fp16")]
21434    unsafe fn test_mm512_fmsubadd_round_ph() {
21435        let a = _mm512_set1_ph(1.0);
21436        let b = _mm512_set1_ph(2.0);
21437        let c = _mm512_set1_ph(3.0);
21438        let r =
21439            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21440        let e = _mm512_set_ph(
21441            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21442            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21443        );
21444        assert_eq_m512h(r, e);
21445    }
21446
21447    #[simd_test(enable = "avx512fp16")]
21448    unsafe fn test_mm512_mask_fmsubadd_round_ph() {
21449        let a = _mm512_set1_ph(1.0);
21450        let b = _mm512_set1_ph(2.0);
21451        let c = _mm512_set1_ph(3.0);
21452        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21453            a,
21454            0b00110011001100110011001100110011,
21455            b,
21456            c,
21457        );
21458        let e = _mm512_set_ph(
21459            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21460            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21461        );
21462        assert_eq_m512h(r, e);
21463    }
21464
21465    #[simd_test(enable = "avx512fp16")]
21466    unsafe fn test_mm512_mask3_fmsubadd_round_ph() {
21467        let a = _mm512_set1_ph(1.0);
21468        let b = _mm512_set1_ph(2.0);
21469        let c = _mm512_set1_ph(3.0);
21470        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21471            a,
21472            b,
21473            c,
21474            0b00110011001100110011001100110011,
21475        );
21476        let e = _mm512_set_ph(
21477            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21478            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21479        );
21480        assert_eq_m512h(r, e);
21481    }
21482
21483    #[simd_test(enable = "avx512fp16")]
21484    unsafe fn test_mm512_maskz_fmsubadd_round_ph() {
21485        let a = _mm512_set1_ph(1.0);
21486        let b = _mm512_set1_ph(2.0);
21487        let c = _mm512_set1_ph(3.0);
21488        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21489            0b00110011001100110011001100110011,
21490            a,
21491            b,
21492            c,
21493        );
21494        let e = _mm512_set_ph(
21495            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21496            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21497        );
21498        assert_eq_m512h(r, e);
21499    }
21500
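    // RCP computes an approximate reciprocal. The inputs are chosen so that the
    // exact result (1.0 / 2.0 = 0.5) is a power of two, where the approximation
    // returns the exact value and a bitwise equality assertion is safe.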
21501    #[simd_test(enable = "avx512fp16,avx512vl")]
21502    unsafe fn test_mm_rcp_ph() {
21503        let a = _mm_set1_ph(2.0);
21504        let r = _mm_rcp_ph(a);
21505        let e = _mm_set1_ph(0.5);
21506        assert_eq_m128h(r, e);
21507    }
21508
21509    #[simd_test(enable = "avx512fp16,avx512vl")]
21510    unsafe fn test_mm_mask_rcp_ph() {
21511        let a = _mm_set1_ph(2.0);
21512        let src = _mm_set1_ph(1.0);
21513        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
21514        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21515        assert_eq_m128h(r, e);
21516    }
21517
21518    #[simd_test(enable = "avx512fp16,avx512vl")]
21519    unsafe fn test_mm_maskz_rcp_ph() {
21520        let a = _mm_set1_ph(2.0);
21521        let r = _mm_maskz_rcp_ph(0b01010101, a);
21522        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21523        assert_eq_m128h(r, e);
21524    }
21525
21526    #[simd_test(enable = "avx512fp16,avx512vl")]
21527    unsafe fn test_mm256_rcp_ph() {
21528        let a = _mm256_set1_ph(2.0);
21529        let r = _mm256_rcp_ph(a);
21530        let e = _mm256_set1_ph(0.5);
21531        assert_eq_m256h(r, e);
21532    }
21533
21534    #[simd_test(enable = "avx512fp16,avx512vl")]
21535    unsafe fn test_mm256_mask_rcp_ph() {
21536        let a = _mm256_set1_ph(2.0);
21537        let src = _mm256_set1_ph(1.0);
21538        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
21539        let e = _mm256_set_ph(
21540            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21541        );
21542        assert_eq_m256h(r, e);
21543    }
21544
21545    #[simd_test(enable = "avx512fp16,avx512vl")]
21546    unsafe fn test_mm256_maskz_rcp_ph() {
21547        let a = _mm256_set1_ph(2.0);
21548        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
21549        let e = _mm256_set_ph(
21550            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21551        );
21552        assert_eq_m256h(r, e);
21553    }
21554
21555    #[simd_test(enable = "avx512fp16")]
21556    unsafe fn test_mm512_rcp_ph() {
21557        let a = _mm512_set1_ph(2.0);
21558        let r = _mm512_rcp_ph(a);
21559        let e = _mm512_set1_ph(0.5);
21560        assert_eq_m512h(r, e);
21561    }
21562
21563    #[simd_test(enable = "avx512fp16")]
21564    unsafe fn test_mm512_mask_rcp_ph() {
21565        let a = _mm512_set1_ph(2.0);
21566        let src = _mm512_set1_ph(1.0);
21567        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
21568        let e = _mm512_set_ph(
21569            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21570            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21571        );
21572        assert_eq_m512h(r, e);
21573    }
21574
21575    #[simd_test(enable = "avx512fp16")]
21576    unsafe fn test_mm512_maskz_rcp_ph() {
21577        let a = _mm512_set1_ph(2.0);
21578        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
21579        let e = _mm512_set_ph(
21580            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21581            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21582        );
21583        assert_eq_m512h(r, e);
21584    }
21585
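    // Scalar form: `_mm_rcp_sh(a, b)` approximates the reciprocal of b[0] and
    // passes elements 1..8 through from `a`.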
21586    #[simd_test(enable = "avx512fp16")]
21587    unsafe fn test_mm_rcp_sh() {
21588        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21589        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21590        let r = _mm_rcp_sh(a, b);
21591        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21592        assert_eq_m128h(r, e);
21593    }
21594
21595    #[simd_test(enable = "avx512fp16")]
21596    unsafe fn test_mm_mask_rcp_sh() {
21597        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21598        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21599        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21600        let r = _mm_mask_rcp_sh(src, 0, a, b);
21601        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21602        assert_eq_m128h(r, e);
21603        let r = _mm_mask_rcp_sh(src, 1, a, b);
21604        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21605        assert_eq_m128h(r, e);
21606    }
21607
21608    #[simd_test(enable = "avx512fp16")]
21609    unsafe fn test_mm_maskz_rcp_sh() {
21610        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21611        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21612        let r = _mm_maskz_rcp_sh(0, a, b);
21613        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21614        assert_eq_m128h(r, e);
21615        let r = _mm_maskz_rcp_sh(1, a, b);
21616        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21617        assert_eq_m128h(r, e);
21618    }
21619
21620    #[simd_test(enable = "avx512fp16,avx512vl")]
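    // RSQRT computes an approximate reciprocal square root. As with RCP, the
    // inputs are chosen so the exact result (1.0 / sqrt(4.0) = 0.5) is returned
    // exactly.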
21621    unsafe fn test_mm_rsqrt_ph() {
21622        let a = _mm_set1_ph(4.0);
21623        let r = _mm_rsqrt_ph(a);
21624        let e = _mm_set1_ph(0.5);
21625        assert_eq_m128h(r, e);
21626    }
21627
21628    #[simd_test(enable = "avx512fp16,avx512vl")]
21629    unsafe fn test_mm_mask_rsqrt_ph() {
21630        let a = _mm_set1_ph(4.0);
21631        let src = _mm_set1_ph(1.0);
21632        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
21633        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21634        assert_eq_m128h(r, e);
21635    }
21636
21637    #[simd_test(enable = "avx512fp16,avx512vl")]
21638    unsafe fn test_mm_maskz_rsqrt_ph() {
21639        let a = _mm_set1_ph(4.0);
21640        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
21641        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21642        assert_eq_m128h(r, e);
21643    }
21644
21645    #[simd_test(enable = "avx512fp16,avx512vl")]
21646    unsafe fn test_mm256_rsqrt_ph() {
21647        let a = _mm256_set1_ph(4.0);
21648        let r = _mm256_rsqrt_ph(a);
21649        let e = _mm256_set1_ph(0.5);
21650        assert_eq_m256h(r, e);
21651    }
21652
21653    #[simd_test(enable = "avx512fp16,avx512vl")]
21654    unsafe fn test_mm256_mask_rsqrt_ph() {
21655        let a = _mm256_set1_ph(4.0);
21656        let src = _mm256_set1_ph(1.0);
21657        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
21658        let e = _mm256_set_ph(
21659            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21660        );
21661        assert_eq_m256h(r, e);
21662    }
21663
21664    #[simd_test(enable = "avx512fp16,avx512vl")]
21665    unsafe fn test_mm256_maskz_rsqrt_ph() {
21666        let a = _mm256_set1_ph(4.0);
21667        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
21668        let e = _mm256_set_ph(
21669            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21670        );
21671        assert_eq_m256h(r, e);
21672    }
21673
21674    #[simd_test(enable = "avx512fp16")]
21675    unsafe fn test_mm512_rsqrt_ph() {
21676        let a = _mm512_set1_ph(4.0);
21677        let r = _mm512_rsqrt_ph(a);
21678        let e = _mm512_set1_ph(0.5);
21679        assert_eq_m512h(r, e);
21680    }
21681
21682    #[simd_test(enable = "avx512fp16")]
21683    unsafe fn test_mm512_mask_rsqrt_ph() {
21684        let a = _mm512_set1_ph(4.0);
21685        let src = _mm512_set1_ph(1.0);
21686        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
21687        let e = _mm512_set_ph(
21688            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21689            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21690        );
21691        assert_eq_m512h(r, e);
21692    }
21693
21694    #[simd_test(enable = "avx512fp16")]
21695    unsafe fn test_mm512_maskz_rsqrt_ph() {
21696        let a = _mm512_set1_ph(4.0);
21697        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
21698        let e = _mm512_set_ph(
21699            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21700            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21701        );
21702        assert_eq_m512h(r, e);
21703    }
21704
21705    #[simd_test(enable = "avx512fp16")]
21706    unsafe fn test_mm_rsqrt_sh() {
21707        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21708        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
21709        let r = _mm_rsqrt_sh(a, b);
21710        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21711        assert_eq_m128h(r, e);
21712    }
21713
21714    #[simd_test(enable = "avx512fp16")]
21715    unsafe fn test_mm_mask_rsqrt_sh() {
21716        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21717        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
21718        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21719        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
21720        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21721        assert_eq_m128h(r, e);
21722        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
21723        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21724        assert_eq_m128h(r, e);
21725    }
21726
21727    #[simd_test(enable = "avx512fp16")]
21728    unsafe fn test_mm_maskz_rsqrt_sh() {
21729        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21730        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
21731        let r = _mm_maskz_rsqrt_sh(0, a, b);
21732        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21733        assert_eq_m128h(r, e);
21734        let r = _mm_maskz_rsqrt_sh(1, a, b);
21735        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21736        assert_eq_m128h(r, e);
21737    }
21738
21739    #[simd_test(enable = "avx512fp16,avx512vl")]
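    // SQRT, unlike the RCP and RSQRT approximations, computes a correctly rounded
    // square root.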
21740    unsafe fn test_mm_sqrt_ph() {
21741        let a = _mm_set1_ph(4.0);
21742        let r = _mm_sqrt_ph(a);
21743        let e = _mm_set1_ph(2.0);
21744        assert_eq_m128h(r, e);
21745    }
21746
21747    #[simd_test(enable = "avx512fp16,avx512vl")]
21748    unsafe fn test_mm_mask_sqrt_ph() {
21749        let a = _mm_set1_ph(4.0);
21750        let src = _mm_set1_ph(1.0);
21751        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
21752        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
21753        assert_eq_m128h(r, e);
21754    }
21755
21756    #[simd_test(enable = "avx512fp16,avx512vl")]
21757    unsafe fn test_mm_maskz_sqrt_ph() {
21758        let a = _mm_set1_ph(4.0);
21759        let r = _mm_maskz_sqrt_ph(0b01010101, a);
21760        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
21761        assert_eq_m128h(r, e);
21762    }
21763
21764    #[simd_test(enable = "avx512fp16,avx512vl")]
21765    unsafe fn test_mm256_sqrt_ph() {
21766        let a = _mm256_set1_ph(4.0);
21767        let r = _mm256_sqrt_ph(a);
21768        let e = _mm256_set1_ph(2.0);
21769        assert_eq_m256h(r, e);
21770    }
21771
21772    #[simd_test(enable = "avx512fp16,avx512vl")]
21773    unsafe fn test_mm256_mask_sqrt_ph() {
21774        let a = _mm256_set1_ph(4.0);
21775        let src = _mm256_set1_ph(1.0);
21776        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
21777        let e = _mm256_set_ph(
21778            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
21779        );
21780        assert_eq_m256h(r, e);
21781    }
21782
21783    #[simd_test(enable = "avx512fp16,avx512vl")]
21784    unsafe fn test_mm256_maskz_sqrt_ph() {
21785        let a = _mm256_set1_ph(4.0);
21786        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
21787        let e = _mm256_set_ph(
21788            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
21789        );
21790        assert_eq_m256h(r, e);
21791    }
21792
21793    #[simd_test(enable = "avx512fp16")]
21794    unsafe fn test_mm512_sqrt_ph() {
21795        let a = _mm512_set1_ph(4.0);
21796        let r = _mm512_sqrt_ph(a);
21797        let e = _mm512_set1_ph(2.0);
21798        assert_eq_m512h(r, e);
21799    }
21800
21801    #[simd_test(enable = "avx512fp16")]
21802    unsafe fn test_mm512_mask_sqrt_ph() {
21803        let a = _mm512_set1_ph(4.0);
21804        let src = _mm512_set1_ph(1.0);
21805        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
21806        let e = _mm512_set_ph(
21807            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
21808            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
21809        );
21810        assert_eq_m512h(r, e);
21811    }
21812
21813    #[simd_test(enable = "avx512fp16")]
21814    unsafe fn test_mm512_maskz_sqrt_ph() {
21815        let a = _mm512_set1_ph(4.0);
21816        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
21817        let e = _mm512_set_ph(
21818            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
21819            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
21820        );
21821        assert_eq_m512h(r, e);
21822    }
21823
21824    #[simd_test(enable = "avx512fp16")]
21825    unsafe fn test_mm512_sqrt_round_ph() {
21826        let a = _mm512_set1_ph(4.0);
21827        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
21828        let e = _mm512_set1_ph(2.0);
21829        assert_eq_m512h(r, e);
21830    }
21831
21832    #[simd_test(enable = "avx512fp16")]
21833    unsafe fn test_mm512_mask_sqrt_round_ph() {
21834        let a = _mm512_set1_ph(4.0);
21835        let src = _mm512_set1_ph(1.0);
21836        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21837            src,
21838            0b01010101010101010101010101010101,
21839            a,
21840        );
21841        let e = _mm512_set_ph(
21842            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
21843            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
21844        );
21845        assert_eq_m512h(r, e);
21846    }
21847
21848    #[simd_test(enable = "avx512fp16")]
21849    unsafe fn test_mm512_maskz_sqrt_round_ph() {
21850        let a = _mm512_set1_ph(4.0);
21851        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21852            0b01010101010101010101010101010101,
21853            a,
21854        );
21855        let e = _mm512_set_ph(
21856            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
21857            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
21858        );
21859        assert_eq_m512h(r, e);
21860    }
21861
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_sqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_sqrt_sh(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_sqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_sqrt_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_sqrt_sh(src, 1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_sqrt_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_maskz_sqrt_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_sqrt_sh(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_sqrt_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_sqrt_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_sqrt_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
        let r =
            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_max_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_max_ph(a, b);
        let e = _mm_set1_ph(2.0);
        assert_eq_m128h(r, e);
    }

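    // Mask bit i selects element i, i.e. the first argument of _mm_setr_ph and the
    // *last* argument of _mm_set_ph, so 0b01010101 keeps the computed value in the
    // even-indexed lanes and falls back to src (or zero) elsewhere.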
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_max_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let src = _mm_set1_ph(3.0);
        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_max_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_maskz_max_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_max_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_max_ph(a, b);
        let e = _mm256_set1_ph(2.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_max_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let src = _mm256_set1_ph(3.0);
        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_max_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_max_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_max_ph(a, b);
        let e = _mm512_set1_ph(2.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_max_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_max_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

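    // For max/min the rounding-mode bits are irrelevant (the result is one of the
    // inputs unchanged); the *_round variants matter only for _MM_FROUND_NO_EXC,
    // which suppresses floating-point exceptions (SAE).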
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_max_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(2.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_max_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_max_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_max_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_max_sh(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_max_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_max_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_max_sh(src, 1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_max_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_maskz_max_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_max_sh(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_max_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_max_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_max_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r =
            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_min_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_min_ph(a, b);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_min_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let src = _mm_set1_ph(3.0);
        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_min_ph() {
        let a = _mm_set1_ph(2.0);
        let b = _mm_set1_ph(1.0);
        let r = _mm_maskz_min_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_min_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_min_ph(a, b);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_min_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let src = _mm256_set1_ph(3.0);
        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_min_ph() {
        let a = _mm256_set1_ph(2.0);
        let b = _mm256_set1_ph(1.0);
        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_min_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_min_ph(a, b);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_min_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_min_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_min_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_min_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let src = _mm512_set1_ph(3.0);
        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_min_round_ph() {
        let a = _mm512_set1_ph(2.0);
        let b = _mm512_set1_ph(1.0);
        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_min_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_min_sh(a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_min_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_min_sh(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_min_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_min_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_maskz_min_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_min_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_min_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_min_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_min_round_sh() {
        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
        let r =
            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        assert_eq_m128h(r, e);
    }

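    // VGETEXPPH returns floor(log2(|x|)) as an f16: 3.0 = 1.5 * 2^1, so getexp
    // yields 1.0.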
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_getexp_ph() {
        let a = _mm_set1_ph(3.0);
        let r = _mm_getexp_ph(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getexp_ph() {
        let a = _mm_set1_ph(3.0);
        let src = _mm_set1_ph(4.0);
        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getexp_ph() {
        let a = _mm_set1_ph(3.0);
        let r = _mm_maskz_getexp_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_getexp_ph() {
        let a = _mm256_set1_ph(3.0);
        let r = _mm256_getexp_ph(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_getexp_ph() {
        let a = _mm256_set1_ph(3.0);
        let src = _mm256_set1_ph(4.0);
        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_getexp_ph() {
        let a = _mm256_set1_ph(3.0);
        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_getexp_ph() {
        let a = _mm512_set1_ph(3.0);
        let r = _mm512_getexp_ph(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getexp_ph() {
        let a = _mm512_set1_ph(3.0);
        let src = _mm512_set1_ph(4.0);
        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getexp_ph() {
        let a = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_getexp_round_ph() {
        let a = _mm512_set1_ph(3.0);
        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getexp_round_ph() {
        let a = _mm512_set1_ph(3.0);
        let src = _mm512_set1_ph(4.0);
        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getexp_round_ph() {
        let a = _mm512_set1_ph(3.0);
        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getexp_sh(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getexp_sh(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getexp_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_getexp_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getexp_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getexp_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_getexp_round_sh() {
        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

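    // VGETMANTPH normalizes each mantissa into the interval selected by the NORM
    // parameter: with _MM_MANT_NORM_P75_1P5, 10.0 = 1.25 * 2^3 maps to 1.25.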
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
        let e = _mm_set1_ph(1.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let src = _mm_set1_ph(20.0);
        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_getmant_ph() {
        let a = _mm_set1_ph(10.0);
        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
        let e = _mm256_set1_ph(1.25);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let src = _mm256_set1_ph(20.0);
        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_getmant_ph() {
        let a = _mm256_set1_ph(10.0);
        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
        let e = _mm512_set1_ph(1.25);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getmant_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let r =
            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
                a,
            );
        let e = _mm512_set1_ph(1.25);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let src = _mm512_set1_ph(20.0);
        let r = _mm512_mask_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
            20.0, 1.25, 20.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_getmant_round_ph() {
        let a = _mm512_set1_ph(10.0);
        let r = _mm512_maskz_getmant_round_ph::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_getmant_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
            a, b,
        );
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 0, a, b);
        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(src, 1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_getmant_round_sh() {
        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_getmant_round_sh::<
            _MM_MANT_NORM_P75_1P5,
            _MM_MANT_SIGN_NAN,
            _MM_FROUND_NO_EXC,
        >(1, a, b);
        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

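    // VRNDSCALEPH computes 2^-M * round(2^M * x) with M = IMM8[7:4]; IMM8 = 0 is
    // plain round-to-nearest, so 1.1 rounds to 1.0.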
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let r = _mm_roundscale_ph::<0>(a);
        let e = _mm_set1_ph(1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let src = _mm_set1_ph(2.0);
        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_roundscale_ph() {
        let a = _mm_set1_ph(1.1);
        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let r = _mm256_roundscale_ph::<0>(a);
        let e = _mm256_set1_ph(1.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let src = _mm256_set1_ph(2.0);
        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_roundscale_ph() {
        let a = _mm256_set1_ph(1.1);
        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_roundscale_ph::<0>(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_roundscale_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
        let e = _mm512_set1_ph(1.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_roundscale_round_ph() {
        let a = _mm512_set1_ph(1.1);
        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_roundscale_sh::<0>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_roundscale_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_roundscale_round_sh() {
        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

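    // VSCALEFPH computes a * 2^floor(b) per element: 1.0 * 2^3 == 8.0.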
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let r = _mm_scalef_ph(a, b);
        let e = _mm_set1_ph(8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let src = _mm_set1_ph(2.);
        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_scalef_ph() {
        let a = _mm_set1_ph(1.);
        let b = _mm_set1_ph(3.);
        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let r = _mm256_scalef_ph(a, b);
        let e = _mm256_set1_ph(8.0);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let src = _mm256_set1_ph(2.);
        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_scalef_ph() {
        let a = _mm256_set1_ph(1.);
        let b = _mm256_set1_ph(3.);
        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_scalef_ph(a, b);
        let e = _mm512_set1_ph(8.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let src = _mm512_set1_ph(2.);
        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_scalef_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm512_set1_ph(8.0);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let src = _mm512_set1_ph(2.);
        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_scalef_round_ph() {
        let a = _mm512_set1_ph(1.);
        let b = _mm512_set1_ph(3.);
        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
            b,
        );
        let e = _mm512_set_ph(
            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_scalef_sh(a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_scalef_sh(src, 0, a, b);
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_scalef_sh(src, 1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_scalef_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_maskz_scalef_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_scalef_sh(1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_scalef_round_sh() {
        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
        let r =
            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
        assert_eq_m128h(r, e);
    }

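    // VREDUCEPH returns x minus x rounded to M = IMM8[7:4] fraction bits; with
    // 16 | _MM_FROUND_TO_ZERO, M = 1 and 1.25 truncates to 1.0, leaving 0.25.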
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm_set1_ph(0.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let src = _mm_set1_ph(2.0);
        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_reduce_ph() {
        let a = _mm_set1_ph(1.25);
        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm256_set1_ph(0.25);
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let src = _mm256_set1_ph(2.0);
        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_reduce_ph() {
        let a = _mm256_set1_ph(1.25);
        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
        let e = _mm512_set1_ph(0.25);
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let src = _mm512_set1_ph(2.0);
        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_reduce_ph() {
        let a = _mm512_set1_ph(1.25);
        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23269        );
23270        assert_eq_m512h(r, e);
23271    }
23272
23273    #[simd_test(enable = "avx512fp16")]
23274    unsafe fn test_mm512_reduce_round_ph() {
23275        let a = _mm512_set1_ph(1.25);
23276        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
23277        let e = _mm512_set1_ph(0.25);
23278        assert_eq_m512h(r, e);
23279    }
23280
23281    #[simd_test(enable = "avx512fp16")]
23282    unsafe fn test_mm512_mask_reduce_round_ph() {
23283        let a = _mm512_set1_ph(1.25);
23284        let src = _mm512_set1_ph(2.0);
23285        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23286            src,
23287            0b01010101010101010101010101010101,
23288            a,
23289        );
23290        let e = _mm512_set_ph(
23291            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23292            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23293        );
23294        assert_eq_m512h(r, e);
23295    }
23296
23297    #[simd_test(enable = "avx512fp16")]
23298    unsafe fn test_mm512_maskz_reduce_round_ph() {
23299        let a = _mm512_set1_ph(1.25);
23300        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23301            0b01010101010101010101010101010101,
23302            a,
23303        );
23304        let e = _mm512_set_ph(
23305            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23306            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23307        );
23308        assert_eq_m512h(r, e);
23309    }
23310
23311    #[simd_test(enable = "avx512fp16")]
23312    unsafe fn test_mm_reduce_sh() {
23313        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23314        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23315        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
23316        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23317        assert_eq_m128h(r, e);
23318    }
23319
23320    #[simd_test(enable = "avx512fp16")]
23321    unsafe fn test_mm_mask_reduce_sh() {
23322        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23323        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23324        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23325        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
23326        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23327        assert_eq_m128h(r, e);
23328        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
23329        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23330        assert_eq_m128h(r, e);
23331    }
23332
23333    #[simd_test(enable = "avx512fp16")]
23334    unsafe fn test_mm_maskz_reduce_sh() {
23335        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23336        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23337        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
23338        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23339        assert_eq_m128h(r, e);
23340        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
23341        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23342        assert_eq_m128h(r, e);
23343    }
23344
23345    #[simd_test(enable = "avx512fp16")]
23346    unsafe fn test_mm_reduce_round_sh() {
23347        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23348        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23349        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
23350        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23351        assert_eq_m128h(r, e);
23352    }
23353
23354    #[simd_test(enable = "avx512fp16")]
23355    unsafe fn test_mm_mask_reduce_round_sh() {
23356        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23357        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23358        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23359        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23360            src, 0, a, b,
23361        );
23362        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23363        assert_eq_m128h(r, e);
23364        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23365            src, 1, a, b,
23366        );
23367        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23368        assert_eq_m128h(r, e);
23369    }
23370
23371    #[simd_test(enable = "avx512fp16")]
23372    unsafe fn test_mm_maskz_reduce_round_sh() {
23373        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23374        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23375        let r =
23376            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
23377        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23378        assert_eq_m128h(r, e);
23379        let r =
23380            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
23381        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23382        assert_eq_m128h(r, e);
23383    }
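
    // Scalar sketch of the VREDUCE operation exercised above (assumed helper,
    // f32 for illustration): with M fraction bits kept and truncation,
    // reduce(x) = x - trunc(x * 2^M) / 2^M, e.g. reduce(1.25) with M = 1
    // yields 1.25 - 1.0 = 0.25.
    #[allow(dead_code)]
    fn reduce_sketch(x: f32, m: u32) -> f32 {
        let scale = (1u32 << m) as f32;
        x - (x * scale).trunc() / scale
    }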

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_add_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_reduce_add_ph(a);
        assert_eq!(r, 16.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_add_ph() {
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_reduce_add_ph(a);
        assert_eq!(r, 32.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_add_ph() {
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_reduce_add_ph(a);
        assert_eq!(r, 64.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_mul_ph() {
        let a = _mm_set1_ph(2.0);
        let r = _mm_reduce_mul_ph(a);
        assert_eq!(r, 256.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_mul_ph() {
        let a = _mm256_set1_ph(2.0);
        let r = _mm256_reduce_mul_ph(a);
        // 2.0^16 = 65536 exceeds f16::MAX (65504), so the product and the
        // literal below both round to +infinity and the comparison holds.
        assert_eq!(r, 65536.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_mul_ph() {
        let a = _mm512_set1_ph(2.0);
        let r = _mm512_reduce_mul_ph(a);
        // Likewise, 2.0^32 and the literal below both overflow f16 to +infinity.
        assert_eq!(r, 16777216.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_max_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_reduce_max_ph(a);
        assert_eq!(r, 8.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_max_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_reduce_max_ph(a);
        assert_eq!(r, 16.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_max_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_reduce_max_ph(a);
        assert_eq!(r, 32.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_reduce_min_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_reduce_min_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_reduce_min_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_reduce_min_ph(a);
        assert_eq!(r, 1.0);
    }
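
    // The horizontal reductions above fold all lanes into a single f16. A
    // sketch of a pairwise tree fold (one plausible association, stated as an
    // assumption; with a splatted input the result is order-independent):
    #[allow(dead_code)]
    fn tree_reduce_add_sketch(v: &[f32]) -> f32 {
        match v.len() {
            0 => 0.0,
            1 => v[0],
            n => {
                let (lo, hi) = v.split_at(n / 2);
                tree_reduce_add_sketch(lo) + tree_reduce_add_sketch(hi)
            }
        }
    }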

    // VFPCLASS category bits (imm8): 0x01 QNaN, 0x02 +0, 0x04 -0, 0x08 +inf,
    // 0x10 -inf, 0x20 denormal, 0x40 finite negative, 0x80 SNaN. The 0x18
    // masks below therefore match exactly the two infinity classes.
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_fpclass_ph_mask() {
        let a = _mm_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
        assert_eq!(r, 0b01000000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b0110000001100000);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_fpclass_ph_mask() {
        let a = _mm256_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
        assert_eq!(r, 0b0100000001000000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
        assert_eq!(r, 0b01100000011000000110000001100000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_fpclass_ph_mask() {
        let a = _mm512_set_ph(
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
            1.,
            f16::INFINITY,
            f16::NEG_INFINITY,
            0.0,
            -0.0,
            -2.0,
            f16::NAN,
            5.9e-8, // Denormal
        );
        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
        assert_eq!(r, 0b01000000010000000100000001000000);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_fpclass_sh_mask() {
        let a = _mm_set_sh(f16::INFINITY);
        let r = _mm_fpclass_sh_mask::<0x18>(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_fpclass_sh_mask() {
        let a = _mm_set_sh(f16::INFINITY);
        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
        assert_eq!(r, 0);
        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
        assert_eq!(r, 1);
    }
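
    // Hedged scalar sketch of the VFPCLASS predicate (assumed helper, f32 for
    // illustration; QNaN vs. SNaN is collapsed into bit 0 here, which is
    // enough for the 0x18 masks used above):
    #[allow(dead_code)]
    fn fpclass_sketch(x: f32, imm8: u8) -> bool {
        let cats = (x.is_nan() as u8) // 0x01: QNaN (0x80 SNaN not distinguished)
            | ((x == 0.0 && x.is_sign_positive()) as u8) << 1 // 0x02: +0.0
            | ((x == 0.0 && x.is_sign_negative()) as u8) << 2 // 0x04: -0.0
            | ((x == f32::INFINITY) as u8) << 3 // 0x08: +inf
            | ((x == f32::NEG_INFINITY) as u8) << 4 // 0x10: -inf
            | (x.is_subnormal() as u8) << 5 // 0x20: denormal
            | ((x.is_finite() && x < 0.0) as u8) << 6; // 0x40: finite negative
        cats & imm8 != 0
    }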

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_blend_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
        let r = _mm_mask_blend_ph(0b01010101, a, b);
        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_blend_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0,
        );
        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
        let e = _mm256_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_blend_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_set_ph(
            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
        );
        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
        let e = _mm512_set_ph(
            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
            29.0, -30.0, 31.0, -32.0,
        );
        assert_eq_m512h(r, e);
    }
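
    // Blend sketch (assumed helper): bit i of the mask selects b (1) or a (0)
    // for lane i, mirroring the _mm*_mask_blend_ph tests above.
    #[allow(dead_code)]
    fn mask_blend_sketch(k: u32, a: [f32; 8], b: [f32; 8]) -> [f32; 8] {
        core::array::from_fn(|i| if (k >> i) & 1 == 1 { b[i] } else { a[i] })
    }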

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_permutex2var_ph() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
        let r = _mm_permutex2var_ph(a, idx, b);
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_permutex2var_ph() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let b = _mm256_setr_ph(
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
        let r = _mm256_permutex2var_ph(a, idx, b);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_permutex2var_ph() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let b = _mm512_setr_ph(
            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
            61.0, 62.0, 63.0, 64.0,
        );
        let idx = _mm512_set_epi16(
            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
        );
        let r = _mm512_permutex2var_ph(a, idx, b);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
            59.0, 61.0, 63.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_permutexvar_ph() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
        let r = _mm_permutexvar_ph(idx, a);
        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_permutexvar_ph() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        let r = _mm256_permutexvar_ph(idx, a);
        let e = _mm256_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_permutexvar_ph() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let idx = _mm512_set_epi16(
            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
            17, 19, 21, 23, 25, 27, 29, 31,
        );
        let r = _mm512_permutexvar_ph(idx, a);
        let e = _mm512_setr_ph(
            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
            30.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }
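
    // Selection rule sketched for the 8-lane two-source permutes above
    // (assumed helper): idx[i] & 7 picks the lane and idx[i] & 8 picks the
    // source, 0 → a and 1 → b; wider vectors use the low log2(2N) index bits
    // the same way.
    #[allow(dead_code)]
    fn permutex2var_sketch(a: [f32; 8], idx: [u16; 8], b: [f32; 8]) -> [f32; 8] {
        core::array::from_fn(|i| {
            let j = (idx[i] & 7) as usize;
            if idx[i] & 8 == 0 { a[j] } else { b[j] }
        })
    }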

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtepi16_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_cvtepi16_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvtepi16_ph(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundepi16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
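
    // All masked conversions in this section share the same write-mask rule;
    // a sketch (assumed helper) of the merging form — the zeroing (`maskz`)
    // form is identical with `src` replaced by zeros:
    #[allow(dead_code)]
    fn mask_merge_sketch(k: u32, src: [f32; 8], conv: [f32; 8]) -> [f32; 8] {
        core::array::from_fn(|i| if (k >> i) & 1 == 1 { conv[i] } else { src[i] })
    }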

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtepu16_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepu16_ph() {
        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_cvtepu16_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepu16_ph() {
        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvtepu16_ph(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let src = _mm512_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        );
        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
        );
        assert_eq_m512h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundepu16_ph() {
        let a = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_ph(
            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
        );
        assert_eq_m512h(r, e);
    }
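
    // Note on the 32- and 64-bit conversions below: they narrow the element
    // count, so the f16 results occupy only the low lanes of the returned
    // vector and the remaining upper lanes are zeroed — e.g. four i32 in an
    // __m128i become four f16 in the low 64 bits of an __m128h.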

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepi32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_cvtepi32_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepi32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtepi32_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvtepi32_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundepi32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvti32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvti32_sh(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundi32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepu32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_cvtepu32_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepu32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepu32_ph() {
        let a = _mm_set_epi32(1, 2, 3, 4);
        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_cvtepu32_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepu32_ph() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvtepu32_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundepu32_ph() {
        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtu32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtu32_sh(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundu32_sh() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }
24395
24396    #[simd_test(enable = "avx512fp16,avx512vl")]
24397    unsafe fn test_mm_cvtepi64_ph() {
24398        let a = _mm_set_epi64x(1, 2);
24399        let r = _mm_cvtepi64_ph(a);
24400        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24401        assert_eq_m128h(r, e);
24402    }
24403
24404    #[simd_test(enable = "avx512fp16,avx512vl")]
24405    unsafe fn test_mm_mask_cvtepi64_ph() {
24406        let a = _mm_set_epi64x(1, 2);
24407        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24408        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
24409        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24410        assert_eq_m128h(r, e);
24411    }
24412
24413    #[simd_test(enable = "avx512fp16,avx512vl")]
24414    unsafe fn test_mm_maskz_cvtepi64_ph() {
24415        let a = _mm_set_epi64x(1, 2);
24416        let r = _mm_maskz_cvtepi64_ph(0b01, a);
24417        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
24418        assert_eq_m128h(r, e);
24419    }
24420
24421    #[simd_test(enable = "avx512fp16,avx512vl")]
24422    unsafe fn test_mm256_cvtepi64_ph() {
24423        let a = _mm256_set_epi64x(1, 2, 3, 4);
24424        let r = _mm256_cvtepi64_ph(a);
24425        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24426        assert_eq_m128h(r, e);
24427    }
24428
24429    #[simd_test(enable = "avx512fp16,avx512vl")]
24430    unsafe fn test_mm256_mask_cvtepi64_ph() {
24431        let a = _mm256_set_epi64x(1, 2, 3, 4);
24432        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24433        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
24434        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24435        assert_eq_m128h(r, e);
24436    }
24437
24438    #[simd_test(enable = "avx512fp16,avx512vl")]
24439    unsafe fn test_mm256_maskz_cvtepi64_ph() {
24440        let a = _mm256_set_epi64x(1, 2, 3, 4);
24441        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
24442        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24443        assert_eq_m128h(r, e);
24444    }
24445
24446    #[simd_test(enable = "avx512fp16")]
24447    unsafe fn test_mm512_cvtepi64_ph() {
24448        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24449        let r = _mm512_cvtepi64_ph(a);
24450        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24451        assert_eq_m128h(r, e);
24452    }
24453
24454    #[simd_test(enable = "avx512fp16")]
24455    unsafe fn test_mm512_mask_cvtepi64_ph() {
24456        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24457        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24458        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
24459        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24460        assert_eq_m128h(r, e);
24461    }
24462
24463    #[simd_test(enable = "avx512fp16")]
24464    unsafe fn test_mm512_maskz_cvtepi64_ph() {
24465        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24466        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
24467        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24468        assert_eq_m128h(r, e);
24469    }
24470
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundepi64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtepu64_ph() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm_cvtepu64_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtepu64_ph() {
        let a = _mm_set_epi64x(1, 2);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtepu64_ph() {
        let a = _mm_set_epi64x(1, 2);
        let r = _mm_maskz_cvtepu64_ph(0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtepu64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_cvtepu64_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtepu64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepu64_ph() {
        let a = _mm256_set_epi64x(1, 2, 3, 4);
        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvtepu64_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundepu64_ph() {
        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

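    // f32 -> f16 narrowing: a 128-bit source yields only four f16 results, so
    // the upper four lanes of the 128-bit destination are zeroed.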
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtxps_ph() {
        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtxps_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtxps_ph() {
        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtxps_ph() {
        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtxps_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtxps_ph() {
        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtxps_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtxps_ph() {
        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtxps_ph() {
        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtxps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtxps_ph(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtxps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
        let e = _mm256_set_ph(
            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtxps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtx_roundps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtx_roundps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_ph(
            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
        );
        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m256h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtx_roundps_ph() {
        let a = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm256_set_ph(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m256h(r, e);
    }

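    // Scalar conversion: the low f32 of `b` is converted to f16 and written to
    // lane 0 of the result, while lanes 1..=7 are copied from `a`
    // (`_mm_setr_ph` lists lane 0 first).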
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtss_sh(a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        let r = _mm_mask_cvtss_sh(src, 0, a, b);
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_cvtss_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtss_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_cvtss_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundss_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        let r =
            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

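    // f64 -> f16 narrowing: two doubles produce two f16 lanes; the remaining
    // six lanes of the 128-bit destination are zeroed.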
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtpd_ph() {
        let a = _mm_set_pd(1.0, 2.0);
        let r = _mm_cvtpd_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtpd_ph() {
        let a = _mm_set_pd(1.0, 2.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtpd_ph() {
        let a = _mm_set_pd(1.0, 2.0);
        let r = _mm_maskz_cvtpd_ph(0b01, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtpd_ph() {
        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtpd_ph(a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtpd_ph() {
        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtpd_ph() {
        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtpd_ph(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundpd_ph() {
        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let r = _mm_cvtsd_sh(a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let r = _mm_maskz_cvtsd_sh(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r = _mm_maskz_cvtsd_sh(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0, a, b,
        );
        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 1, a, b,
        );
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundsd_sh() {
        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
        let b = _mm_setr_pd(1.0, 2.0);
        let r =
            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
        let r =
            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
        assert_eq_m128h(r, e);
    }

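    // f16 -> i16 conversions. The plain `cvtph` variants round with the
    // current rounding mode (round-to-nearest-even by default); the `cvttph`
    // variants tested further below truncate toward zero. The inputs here are
    // exact integers, so both families agree on them.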
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtph_epi16(a);
        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm_mask_cvtph_epi16(src, 0b01010101, a);
        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_maskz_cvtph_epi16(0b01010101, a);
        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvtph_epi16(a);
        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm256_mask_cvtph_epi16(src, 0b0101010101010101, a);
        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_maskz_cvtph_epi16(0b0101010101010101, a);
        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtph_epi16(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvtph_epi16(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvtph_epi16(0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtph_epu16(a);
        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm_mask_cvtph_epu16(src, 0b01010101, a);
        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_maskz_cvtph_epu16(0b01010101, a);
        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvtph_epu16(a);
        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm256_mask_cvtph_epu16(src, 0b0101010101010101, a);
        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_maskz_cvtph_epu16(0b0101010101010101, a);
        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtph_epu16(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvtph_epu16(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

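    // The `cvtt` (truncate) variants always convert with round-toward-zero,
    // independent of the current rounding mode.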
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttph_epi16(a);
        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epi16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvttph_epi16(a);
        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epi16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvttph_epi16(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epi16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttph_epu16(a);
        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epu16() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvttph_epu16(a);
        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm256_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epu16() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvttph_epu16(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi16(
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
            25, 26, 27, 28, 29, 30, 31, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let src = _mm512_set_epi16(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        );
        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
            src,
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
            24, 34, 26, 36, 28, 38, 30, 40, 32,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epu16() {
        let a = _mm512_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
            0b01010101010101010101010101010101,
            a,
        );
        let e = _mm512_set_epi16(
            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
            0, 28, 0, 30, 0, 32,
        );
        assert_eq_m512i(r, e);
    }

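    // f16 -> i32 widening: only the low lanes of the f16 source are consumed
    // (four for a 128-bit destination), so the unused upper lanes are set to
    // zero in these inputs.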
25644    #[simd_test(enable = "avx512fp16,avx512vl")]
25645    unsafe fn test_mm_cvtph_epi32() {
25646        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25647        let r = _mm_cvtph_epi32(a);
25648        let e = _mm_set_epi32(1, 2, 3, 4);
25649        assert_eq_m128i(r, e);
25650    }
25651
25652    #[simd_test(enable = "avx512fp16,avx512vl")]
25653    unsafe fn test_mm_mask_cvtph_epi32() {
25654        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25655        let src = _mm_set_epi32(10, 11, 12, 13);
25656        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
25657        let e = _mm_set_epi32(10, 2, 12, 4);
25658        assert_eq_m128i(r, e);
25659    }
25660
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epi32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtph_epi32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtph_epi32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtph_epi32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

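    // Scalar variants: `_mm_cvtsh_i32` converts only the lowest f16 element of
    // the vector to a signed 32-bit integer; the `_round` form additionally
    // fixes the rounding mode at compile time.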
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_i32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtsh_i32(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_i32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        assert_eq!(r, 1);
    }

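    // Unsigned counterparts: same shapes as the epi32 tests above, but
    // converting to unsigned 32-bit integers.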
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtph_epu32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtph_epu32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtph_epu32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtph_epu32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src,
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b0101010101010101,
            a,
        );
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

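    // Scalar conversion of the lowest f16 element to an unsigned 32-bit
    // integer.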
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtsh_u32(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        assert_eq!(r, 1);
    }

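    // The `cvtt*` family truncates toward zero rather than using the current
    // rounding mode, so the `_round` variants below take only the
    // exception-suppression flag `_MM_FROUND_NO_EXC` instead of a rounding
    // mode.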
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epi32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvttph_epi32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epi32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epi32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvttph_epi32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvttph_epi32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epi32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvttph_epi32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epi32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

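    // Truncating scalar conversion of the lowest f16 element to i32.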
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvttsh_i32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttsh_i32(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtt_roundsh_i32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
        assert_eq!(r, 1);
    }

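    // Truncating conversions to unsigned 32-bit integers.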
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvttph_epu32(a);
        let e = _mm_set_epi32(1, 2, 3, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let src = _mm_set_epi32(10, 11, 12, 13);
        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
        let e = _mm_set_epi32(10, 2, 12, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epu32() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvttph_epu32(0b0101, a);
        let e = _mm_set_epi32(0, 2, 0, 4);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvttph_epu32(a);
        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epu32() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvttph_epu32(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let src = _mm512_set_epi32(
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        );
        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epu32() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
        assert_eq_m512i(r, e);
    }

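    // Truncating scalar conversion of the lowest f16 element to u32.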
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvttsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvttsh_u32(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtt_roundsh_u32() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
        assert_eq!(r, 1);
    }

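    // Conversions to signed 64-bit integers: only the low 2 (for a 128-bit
    // result), 4 (256-bit) or 8 (512-bit) f16 elements of the __m128h source
    // participate, which is why the remaining inputs are set to 0.0.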
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epi64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epi64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

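    // Unsigned 64-bit counterparts of the conversions above.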
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_epu64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_epu64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            src, 0b01010101, a,
        );
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
            0b01010101, a,
        );
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

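    // Truncating conversions to signed 64-bit integers.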
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epi64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epi64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epi64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epi64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epi64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epi64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epi64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epi64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epi64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

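    // Truncating conversions to unsigned 64-bit integers.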
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvttph_epu64(a);
        let e = _mm_set_epi64x(1, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvttph_epu64() {
        let src = _mm_set_epi64x(3, 4);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
        let e = _mm_set_epi64x(3, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvttph_epu64(0b01, a);
        let e = _mm_set_epi64x(0, 2);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvttph_epu64(a);
        let e = _mm256_set_epi64x(1, 2, 3, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvttph_epu64() {
        let src = _mm256_set_epi64x(5, 6, 7, 8);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
        let e = _mm256_set_epi64x(5, 2, 7, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvttph_epu64() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
        let e = _mm256_set_epi64x(0, 2, 0, 4);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvttph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvttph_epu64(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvttph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvttph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtt_roundph_epu64() {
        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtt_roundph_epu64() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
        assert_eq_m512i(r, e);
    }

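    // Widening conversions from f16 to f32. Every f16 value is exactly
    // representable as an f32, so the comparisons below are exact.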
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtxph_ps() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_cvtxph_ps(a);
        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtxph_ps() {
        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtxph_ps() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm_maskz_cvtxph_ps(0b0101, a);
        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtxph_ps() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_cvtxph_ps(a);
        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtxph_ps() {
        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtxph_ps() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtxph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtxph_ps(a);
        let e = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtxph_ps() {
        let src = _mm512_set_ps(
            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
            24.0, 25.0,
        );
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
        let e = _mm512_set_ps(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtxph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
        let e = _mm512_set_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtx_roundph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtx_roundph_ps() {
        let src = _mm512_set_ps(
            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
            24.0, 25.0,
        );
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
        let e = _mm512_set_ps(
            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
            16.0,
        );
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtx_roundph_ps() {
        let a = _mm256_set_ph(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
        let e = _mm512_set_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
        );
        assert_eq_m512(r, e);
    }

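    // Scalar f16 -> f32 conversion: the low element of the result is converted
    // from the lowest f16 element of `b`, while the upper three f32 lanes are
    // copied from `a`.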
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_ss(a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtsh_ss() {
        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvtsh_ss(0, a, b);
        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_maskz_cvtsh_ss(1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundsh_ss() {
        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundsh_ss() {
        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
        assert_eq_m128(r, e);
    }

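    // Widening conversions from f16 to f64, which are likewise exact.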
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_pd(a);
        let e = _mm_set_pd(1.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_pd() {
        let src = _mm_set_pd(10.0, 11.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_pd(src, 0b01, a);
        let e = _mm_set_pd(10.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_pd(0b01, a);
        let e = _mm_set_pd(0.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_pd(a);
        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_pd() {
        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_pd(0b0101, a);
        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_pd(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }

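    // Scalar f16 -> f64 conversion; the upper f64 lane of the result is copied
    // from `a`.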
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_sd(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvtsh_sd(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvtsh_sd(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

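    // `cvtsh_h` extracts the lowest f16 element of a vector of any width.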
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_h() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm256_cvtsh_h() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtsh_h() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

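    // 16-bit moves between an integer and the low lane of a vector:
    // `_mm_cvtsi128_si16` reads the low 16 bits, and `_mm_cvtsi16_si128`
    // zero-extends a 16-bit value into a fresh vector.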
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi128_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtsi128_si16(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi16_si128() {
        let a = 1;
        let r = _mm_cvtsi16_si128(a);
        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
}