core/stdarch/crates/core_arch/src/x86/avx512fp16.rs

use crate::arch::asm;
use crate::core_arch::{simd::*, x86::*};
use crate::intrinsics::{fmaf16, simd::*};
use crate::ptr;

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ph)
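///
/// # Examples
///
/// A minimal usage sketch (assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features enabled and a CPU with AVX512-FP16):
///
/// ```ignore
/// // Arguments run from the highest element (e7) down to the lowest (e0),
/// // so the last argument ends up in element 0 of the result.
/// let v = unsafe { _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0) };
/// let mut out = [0.0f16; 8];
/// unsafe { _mm_storeu_ph(out.as_mut_ptr(), v) };
/// assert_eq!(out[0], 0.0);
/// assert_eq!(out[7], 7.0);
/// ```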
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_ph(
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set_ph(
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set_ph(
    e31: f16,
    e30: f16,
    e29: f16,
    e28: f16,
    e27: f16,
    e26: f16,
    e25: f16,
    e24: f16,
    e23: f16,
    e22: f16,
    e21: f16,
    e20: f16,
    e19: f16,
    e18: f16,
    e17: f16,
    e16: f16,
    e15: f16,
    e14: f16,
    e13: f16,
    e12: f16,
    e11: f16,
    e10: f16,
    e9: f16,
    e8: f16,
    e7: f16,
    e6: f16,
    e5: f16,
    e4: f16,
    e3: f16,
    e2: f16,
    e1: f16,
    e0: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Copy half-precision (16-bit) floating-point element a to the lower element of dst, and zero
/// the upper 7 elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set_sh(a: f16) -> __m128h {
    __m128h([a, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_set1_ph(a: f16) -> __m128h {
    unsafe { transmute(f16x8::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_set1_ph(a: f16) -> __m256h {
    unsafe { transmute(f16x16::splat(a)) }
}

/// Broadcast the half-precision (16-bit) floating-point value a to all elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_set1_ph(a: f16) -> __m512h {
    unsafe { transmute(f16x32::splat(a)) }
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_ph)
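///
/// # Examples
///
/// A minimal sketch (assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features and AVX512-FP16 support): the arguments are
/// taken in memory order, so the first argument becomes element 0, the reverse of
/// `_mm_set_ph`.
///
/// ```ignore
/// let a = unsafe { _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0) };
/// let b = unsafe { _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0) };
/// // Element for element, `a` and `b` hold the same values.
/// let k = unsafe { _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b) };
/// assert_eq!(k, 0xff);
/// ```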
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
) -> __m128h {
    __m128h([e0, e1, e2, e3, e4, e5, e6, e7])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
) -> __m256h {
    __m256h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ])
}

/// Set packed half-precision (16-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setr_ph(
    e0: f16,
    e1: f16,
    e2: f16,
    e3: f16,
    e4: f16,
    e5: f16,
    e6: f16,
    e7: f16,
    e8: f16,
    e9: f16,
    e10: f16,
    e11: f16,
    e12: f16,
    e13: f16,
    e14: f16,
    e15: f16,
    e16: f16,
    e17: f16,
    e18: f16,
    e19: f16,
    e20: f16,
    e21: f16,
    e22: f16,
    e23: f16,
    e24: f16,
    e25: f16,
    e26: f16,
    e27: f16,
    e28: f16,
    e29: f16,
    e30: f16,
    e31: f16,
) -> __m512h {
    __m512h([
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, e16, e17, e18, e19,
        e20, e21, e22, e23, e24, e25, e26, e27, e28, e29, e30, e31,
    ])
}

/// Return vector of type __m128h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_setzero_ph() -> __m128h {
    unsafe { transmute(f16x8::ZERO) }
}

/// Return vector of type __m256h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_setzero_ph() -> __m256h {
    f16x16::ZERO.as_m256h()
}

/// Return vector of type __m512h with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_setzero_ph() -> __m512h {
    f16x32::ZERO.as_m512h()
}

/// Return vector of type `__m128h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_undefined_ph() -> __m128h {
    f16x8::ZERO.as_m128h()
}

/// Return vector of type `__m256h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_undefined_ph() -> __m256h {
    f16x16::ZERO.as_m256h()
}

/// Return vector of type `__m512h` with indeterminate elements.
/// Despite using the word "undefined" (following Intel's naming scheme), this non-deterministically
/// picks some valid value and is not equivalent to [`mem::MaybeUninit`](crate::mem::MaybeUninit).
/// In practice, this is typically equivalent to [`mem::zeroed`](crate::mem::zeroed).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_undefined_ph() -> __m512h {
    f16x32::ZERO.as_m512h()
}

/// Cast vector of type `__m128d` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castpd_ph(a: __m128d) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256d` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castpd_ph(a: __m256d) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512d` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castpd_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castpd_ph(a: __m512d) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_pd(a: __m128h) -> __m128d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_pd(a: __m256h) -> __m256d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512d`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_pd)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_pd(a: __m512h) -> __m512d {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castps_ph(a: __m128) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castps_ph(a: __m256) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castps_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castps_ph(a: __m512) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_ps(a: __m128h) -> __m128 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_ps(a: __m256h) -> __m256 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_ps)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_ps(a: __m512h) -> __m512 {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128i` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castsi128_ph(a: __m128i) -> __m128h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256i` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castsi256_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castsi256_ph(a: __m256i) -> __m256h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512i` to type `__m512h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castsi512_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castsi512_ph(a: __m512i) -> __m512h {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m128h` to type `__m128i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castph_si128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_castph_si128(a: __m128h) -> __m128i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m256i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph_si256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph_si256(a: __m256h) -> __m256i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m512h` to type `__m512i`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph_si512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph_si512(a: __m512h) -> __m512i {
    unsafe { transmute(a) }
}

/// Cast vector of type `__m256h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph256_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph256_ph128(a: __m256h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m128h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph128)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph128(a: __m512h) -> __m128h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]) }
}

/// Cast vector of type `__m512h` to type `__m256h`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph512_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph512_ph256(a: __m512h) -> __m256h {
    unsafe { simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]) }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_castph128_ph256)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_castph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are undefined.
/// In practice, the upper elements are zeroed. This intrinsic can generate the `vzeroupper` instruction,
/// but most of the time it does not generate any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_castph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_castph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_undefined_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m256h`. The upper 8 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_zextph128_ph256)
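///
/// # Examples
///
/// A minimal sketch (assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features and AVX512-FP16 support), contrasting this
/// with `_mm256_castph128_ph256`, whose upper elements are left undefined rather
/// than zeroed:
///
/// ```ignore
/// let lo = unsafe { _mm_set1_ph(1.0) };
/// let wide = unsafe { _mm256_zextph128_ph256(lo) };
/// let mut out = [0.0f16; 16];
/// unsafe { _mm256_storeu_ph(out.as_mut_ptr(), wide) };
/// assert_eq!(out[..8], [1.0; 8]); // lower half copied from `lo`
/// assert_eq!(out[8..], [0.0; 8]); // upper half is guaranteed to be zero
/// ```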
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_zextph128_ph256(a: __m128h) -> __m256h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8]
        )
    }
}

/// Cast vector of type `__m256h` to type `__m512h`. The upper 16 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph256_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph256_ph512(a: __m256h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm256_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16,
                16, 16, 16, 16, 16, 16, 16, 16, 16
            ]
        )
    }
}

/// Cast vector of type `__m128h` to type `__m512h`. The upper 24 elements of the result are zeroed.
/// This intrinsic can generate the `vzeroupper` instruction, but most of the time it does not generate
/// any instructions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_zextph128_ph512)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_zextph128_ph512(a: __m128h) -> __m512h {
    unsafe {
        simd_shuffle!(
            a,
            _mm_setzero_ph(),
            [
                0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                8, 8, 8, 8
            ]
        )
    }
}

macro_rules! cmp_asm { // FIXME: use LLVM intrinsics
    ($mask_type: ty, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr, $b: expr) => {{
        let dst: $mask_type;
        asm!(
            "vcmpph {k} {{ {mask} }}, {a}, {b}, {imm8}",
            k = lateout(kreg) dst,
            mask = in(kreg) $mask,
            a = in($reg) $a,
            b = in($reg) $b,
            imm8 = const IMM5,
            options(pure, nomem, nostack)
        );
        dst
    }};
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ph_mask)
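///
/// # Examples
///
/// A minimal sketch (assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features and AVX512-FP16 support). The comparison
/// predicate is one of the `_CMP_*` constants, and bit `i` of the returned mask
/// is set when the comparison holds for element `i`:
///
/// ```ignore
/// let a = unsafe { _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0) };
/// let b = unsafe { _mm_set1_ph(4.0) };
/// let k = unsafe { _mm_cmp_ph_mask::<_CMP_LT_OS>(a, b) };
/// assert_eq!(k, 0b0000_1111); // elements 0..=3 are less than 4.0
/// ```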
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_ph_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, xmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_ph_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask8, k1, xmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_cmp_ph_mask<const IMM5: i32>(a: __m256h, b: __m256h) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm256_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask16,
    a: __m256h,
    b: __m256h,
) -> __mmask16 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask16, k1, ymm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_ph_mask<const IMM5: i32>(a: __m512h, b: __m512h) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_ph_mask<const IMM5: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        cmp_asm!(__mmask32, k1, zmm_reg, a, b)
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, zmm_reg, a, b)
        }
    }
}

/// Compare packed half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the results in mask vector k using zeromask k1 (elements are
/// zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ph_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm512_mask_cmp_round_ph_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask32,
    a: __m512h,
    b: __m512h,
) -> __mmask32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        if SAE == _MM_FROUND_NO_EXC {
            let dst: __mmask32;
            asm!(
                "vcmpph {k} {{{k1}}}, {a}, {b}, {{sae}}, {imm8}",
                k = lateout(kreg) dst,
                k1 = in(kreg) k1,
                a = in(zmm_reg) a,
                b = in(zmm_reg) b,
                imm8 = const IMM5,
                options(pure, nomem, nostack)
            );
            dst
        } else {
            cmp_asm!(__mmask32, k1, zmm_reg, a, b)
        }
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k. Exceptions can be suppressed by
/// passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    static_assert_sae!(SAE);
    _mm_mask_cmp_round_sh_mask::<IMM5, SAE>(0xff, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1. Exceptions can be
/// suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3, 4)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_round_sh_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128h,
    b: __m128h,
) -> __mmask8 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcmpsh(a, b, IMM5, k1, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cmp_sh_mask<const IMM5: i32>(a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and store the result in mask vector k using zeromask k1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sh_mask)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_mask_cmp_sh_mask<const IMM5: i32>(k1: __mmask8, a: __m128h, b: __m128h) -> __mmask8 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_mask_cmp_round_sh_mask::<IMM5, _MM_FROUND_CUR_DIRECTION>(k1, a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2, 3)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_round_sh<const IMM5: i32, const SAE: i32>(a: __m128h, b: __m128h) -> i32 {
    unsafe {
        static_assert_uimm_bits!(IMM5, 5);
        static_assert_sae!(SAE);
        vcomish(a, b, IMM5, SAE)
    }
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b based on the comparison
/// operand specified by imm8, and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_sh)
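///
/// # Examples
///
/// A minimal sketch (assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features and AVX512-FP16 support): only the lowest
/// element of each operand takes part in the comparison.
///
/// ```ignore
/// let a = unsafe { _mm_set_sh(1.0) };
/// let b = unsafe { _mm_set_sh(2.0) };
/// assert_eq!(unsafe { _mm_comi_sh::<_CMP_LT_OS>(a, b) }, 1);
/// assert_eq!(unsafe { _mm_comi_sh::<_CMP_GT_OS>(a, b) }, 0);
/// ```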
#[inline]
#[target_feature(enable = "avx512fp16")]
#[rustc_legacy_const_generics(2)]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comi_sh<const IMM5: i32>(a: __m128h, b: __m128h) -> i32 {
    static_assert_uimm_bits!(IMM5, 5);
    _mm_comi_round_sh::<IMM5, _MM_FROUND_CUR_DIRECTION>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_comineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OS>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for equality, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomieq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomieq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_EQ_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than-or-equal,
/// and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomige_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomige_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for greater-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomigt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomigt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_GT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than-or-equal, and
/// return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomile_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomile_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LE_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for less-than, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomilt_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomilt_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_LT_OQ>(a, b)
}

/// Compare the lower half-precision (16-bit) floating-point elements in a and b for not-equal, and return
/// the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ucomineq_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_ucomineq_sh(a: __m128h, b: __m128h) -> i32 {
    _mm_comi_sh::<_CMP_NEQ_OQ>(a, b)
}

/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 16 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ph)
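///
/// # Examples
///
/// A minimal sketch (assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features and AVX512-FP16 support). The 16-byte
/// alignment comes from the buffer type; for arbitrarily aligned pointers use
/// `_mm_loadu_ph` instead.
///
/// ```ignore
/// #[repr(align(16))]
/// struct Aligned([f16; 8]);
///
/// let data = Aligned([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]);
/// let v = unsafe { _mm_load_ph(data.0.as_ptr()) };
/// ```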
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_ph(mem_addr: *const f16) -> __m128h {
    *mem_addr.cast()
}

/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 32 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16,avx512vl")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm256_load_ph(mem_addr: *const f16) -> __m256h {
    *mem_addr.cast()
}

/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
/// a new vector. The address must be aligned to 64 bytes or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ph)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm512_load_ph(mem_addr: *const f16) -> __m512h {
    *mem_addr.cast()
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector,
/// and zero the upper elements
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sh)
#[inline]
#[target_feature(enable = "avx512fp16")]
#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub unsafe fn _mm_load_sh(mem_addr: *const f16) -> __m128h {
    _mm_set_sh(*mem_addr)
}

/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
/// using writemask k (the element is copied from src when mask bit 0 is not set), and zero the upper elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_sh)
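///
/// # Examples
///
/// A minimal sketch (assumes a nightly toolchain with the `f16` and
/// `stdarch_x86_avx512_f16` features and AVX512-FP16 support): only mask bit 0
/// matters; when it is clear, the lower element is taken from `src`, and the
/// upper 7 elements are zeroed either way.
///
/// ```ignore
/// let src = unsafe { _mm_set_sh(42.0) };
/// let x: f16 = 7.0;
/// let loaded = unsafe { _mm_mask_load_sh(src, 0b1, &x) }; // lower element is 7.0
/// let kept = unsafe { _mm_mask_load_sh(src, 0b0, &x) };   // lower element is 42.0
/// ```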
1127#[inline]
1128#[target_feature(enable = "avx512fp16")]
1129#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1130pub unsafe fn _mm_mask_load_sh(src: __m128h, k: __mmask8, mem_addr: *const f16) -> __m128h {
1131    let mut dst = src;
1132    asm!(
1133        vpl!("vmovsh {dst}{{{k}}}"),
1134        dst = inout(xmm_reg) dst,
1135        k = in(kreg) k,
1136        p = in(reg) mem_addr,
1137        options(pure, readonly, nostack, preserves_flags)
1138    );
1139    dst
1140}
1141
1142/// Load a half-precision (16-bit) floating-point element from memory into the lower element of a new vector
1143/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and zero the upper elements.
1144///
1145/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_sh)
1146#[inline]
1147#[target_feature(enable = "avx512fp16")]
1148#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1149pub unsafe fn _mm_maskz_load_sh(k: __mmask8, mem_addr: *const f16) -> __m128h {
1150    let mut dst: __m128h;
1151    asm!(
1152        vpl!("vmovsh {dst}{{{k}}}{{z}}"),
1153        dst = out(xmm_reg) dst,
1154        k = in(kreg) k,
1155        p = in(reg) mem_addr,
1156        options(pure, readonly, nostack, preserves_flags)
1157    );
1158    dst
1159}
1160
1161/// Load 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from memory into
1162/// a new vector. The address does not need to be aligned to any particular boundary.
1163///
1164/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_ph)
1165#[inline]
1166#[target_feature(enable = "avx512fp16,avx512vl")]
1167#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1168pub unsafe fn _mm_loadu_ph(mem_addr: *const f16) -> __m128h {
1169    ptr::read_unaligned(mem_addr.cast())
1170}
1171
1172/// Load 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from memory into
1173/// a new vector. The address does not need to be aligned to any particular boundary.
1174///
1175/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_ph)
1176#[inline]
1177#[target_feature(enable = "avx512fp16,avx512vl")]
1178#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1179pub unsafe fn _mm256_loadu_ph(mem_addr: *const f16) -> __m256h {
1180    ptr::read_unaligned(mem_addr.cast())
1181}
1182
1183/// Load 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from memory into
1184/// a new vector. The address does not need to be aligned to any particular boundary.
1185///
1186/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ph)
1187#[inline]
1188#[target_feature(enable = "avx512fp16")]
1189#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1190pub unsafe fn _mm512_loadu_ph(mem_addr: *const f16) -> __m512h {
1191    ptr::read_unaligned(mem_addr.cast())
1192}
1193
1194/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1195/// using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper
1196/// 7 packed elements from a to the upper elements of dst.
1197///
1198/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_move_sh)
1199#[inline]
1200#[target_feature(enable = "avx512fp16")]
1201#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1202pub fn _mm_mask_move_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1203    unsafe {
1204        let mut mov: f16 = simd_extract!(src, 0);
1205        if (k & 1) != 0 {
1206            mov = simd_extract!(b, 0);
1207        }
1208        simd_insert!(a, 0, mov)
1209    }
1210}
1211
1212/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst
1213/// using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
1214/// elements from a to the upper elements of dst.
1215///
1216/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_move_sh)
1217#[inline]
1218#[target_feature(enable = "avx512fp16")]
1219#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1220pub fn _mm_maskz_move_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1221    unsafe {
1222        let mut mov: f16 = 0.;
1223        if (k & 1) != 0 {
1224            mov = simd_extract!(b, 0);
1225        }
1226        simd_insert!(a, 0, mov)
1227    }
1228}
1229
1230/// Move the lower half-precision (16-bit) floating-point element from b to the lower element of dst,
1231/// and copy the upper 7 packed elements from a to the upper elements of dst.
1232///
1233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sh)
1234#[inline]
1235#[target_feature(enable = "avx512fp16")]
1236#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1237pub fn _mm_move_sh(a: __m128h, b: __m128h) -> __m128h {
1238    unsafe {
1239        let mov: f16 = simd_extract!(b, 0);
1240        simd_insert!(a, 0, mov)
1241    }
1242}
1243
1244/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1245/// The address must be aligned to 16 bytes or a general-protection exception may be generated.
1246///
1247/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ph)
1248#[inline]
1249#[target_feature(enable = "avx512fp16,avx512vl")]
1250#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1251pub unsafe fn _mm_store_ph(mem_addr: *mut f16, a: __m128h) {
1252    *mem_addr.cast() = a;
1253}
1254
1255/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1256/// The address must be aligned to 32 bytes or a general-protection exception may be generated.
1257///
1258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_ph)
1259#[inline]
1260#[target_feature(enable = "avx512fp16,avx512vl")]
1261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1262pub unsafe fn _mm256_store_ph(mem_addr: *mut f16, a: __m256h) {
1263    *mem_addr.cast() = a;
1264}
1265
1266/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1267/// The address must be aligned to 64 bytes or a general-protection exception may be generated.
1268///
1269/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ph)
1270#[inline]
1271#[target_feature(enable = "avx512fp16")]
1272#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1273pub unsafe fn _mm512_store_ph(mem_addr: *mut f16, a: __m512h) {
1274    *mem_addr.cast() = a;
1275}
1276
1277/// Store the lower half-precision (16-bit) floating-point element from a into memory.
1278///
1279/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_sh)
1280#[inline]
1281#[target_feature(enable = "avx512fp16")]
1282#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1283pub unsafe fn _mm_store_sh(mem_addr: *mut f16, a: __m128h) {
1284    *mem_addr = simd_extract!(a, 0);
1285}
1286
1287/// Store the lower half-precision (16-bit) floating-point element from a into memory using writemask k
1288///
1289/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_sh)
1290#[inline]
1291#[target_feature(enable = "avx512fp16")]
1292#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1293pub unsafe fn _mm_mask_store_sh(mem_addr: *mut f16, k: __mmask8, a: __m128h) {
1294    asm!(
1295        vps!("vmovdqu16", "{{{k}}}, {src}"),
1296        p = in(reg) mem_addr,
1297        k = in(kreg) k,
1298        src = in(xmm_reg) a,
1299        options(nostack, preserves_flags)
1300    );
1301}
1302
1303/// Store 128-bits (composed of 8 packed half-precision (16-bit) floating-point elements) from a into memory.
1304/// The address does not need to be aligned to any particular boundary.
1305///
1306/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_ph)
1307#[inline]
1308#[target_feature(enable = "avx512fp16,avx512vl")]
1309#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1310pub unsafe fn _mm_storeu_ph(mem_addr: *mut f16, a: __m128h) {
1311    ptr::write_unaligned(mem_addr.cast(), a);
1312}
1313
1314/// Store 256-bits (composed of 16 packed half-precision (16-bit) floating-point elements) from a into memory.
1315/// The address does not need to be aligned to any particular boundary.
1316///
1317/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_ph)
1318#[inline]
1319#[target_feature(enable = "avx512fp16,avx512vl")]
1320#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1321pub unsafe fn _mm256_storeu_ph(mem_addr: *mut f16, a: __m256h) {
1322    ptr::write_unaligned(mem_addr.cast(), a);
1323}
1324
1325/// Store 512-bits (composed of 32 packed half-precision (16-bit) floating-point elements) from a into memory.
1326/// The address does not need to be aligned to any particular boundary.
1327///
1328/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ph)
1329#[inline]
1330#[target_feature(enable = "avx512fp16")]
1331#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1332pub unsafe fn _mm512_storeu_ph(mem_addr: *mut f16, a: __m512h) {
1333    ptr::write_unaligned(mem_addr.cast(), a);
1334}
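
// A minimal round-trip sketch (illustrative only, not part of this module's API): stores a
// 512-bit vector through the unaligned store and reloads it. Unlike `_mm512_store_ph`, the
// unaligned variant places no alignment requirement on the pointer, so an ordinary stack array
// suffices. Assumes the module's `_mm512_loadu_ph` counterpart, defined elsewhere in this file.
// The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn storeu_roundtrip_sketch(a: __m512h) -> __m512h {
    // 32-element buffer; the element type is inferred as `f16` from the pointer arguments below.
    let mut buf = [0.0; 32];
    unsafe {
        _mm512_storeu_ph(buf.as_mut_ptr(), a);
        _mm512_loadu_ph(buf.as_ptr())
    }
}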
1335
1336/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1337///
1338/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_ph)
1339#[inline]
1340#[target_feature(enable = "avx512fp16,avx512vl")]
1341#[cfg_attr(test, assert_instr(vaddph))]
1342#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1343pub fn _mm_add_ph(a: __m128h, b: __m128h) -> __m128h {
1344    unsafe { simd_add(a, b) }
1345}
1346
1347/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1348/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1349///
1350/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ph)
1351#[inline]
1352#[target_feature(enable = "avx512fp16,avx512vl")]
1353#[cfg_attr(test, assert_instr(vaddph))]
1354#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1355pub fn _mm_mask_add_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1356    unsafe {
1357        let r = _mm_add_ph(a, b);
1358        simd_select_bitmask(k, r, src)
1359    }
1360}
1361
1362/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1363/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1364///
1365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ph)
1366#[inline]
1367#[target_feature(enable = "avx512fp16,avx512vl")]
1368#[cfg_attr(test, assert_instr(vaddph))]
1369#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1370pub fn _mm_maskz_add_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1371    unsafe {
1372        let r = _mm_add_ph(a, b);
1373        simd_select_bitmask(k, r, _mm_setzero_ph())
1374    }
1375}
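
// A minimal sketch (illustrative only, not part of this module's API) contrasting the two masked
// forms of the packed add: with k = 0b0000_0101 only lanes 0 and 2 receive a + b; the remaining
// lanes are taken from `src` in the writemask form and forced to zero in the zeromask form. The
// helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn masked_add_forms_sketch(src: __m128h, a: __m128h, b: __m128h) -> (__m128h, __m128h) {
    let k: __mmask8 = 0b0000_0101;
    (_mm_mask_add_ph(src, k, a, b), _mm_maskz_add_ph(k, a, b))
}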
1376
1377/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1378///
1379/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_add_ph)
1380#[inline]
1381#[target_feature(enable = "avx512fp16,avx512vl")]
1382#[cfg_attr(test, assert_instr(vaddph))]
1383#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1384pub fn _mm256_add_ph(a: __m256h, b: __m256h) -> __m256h {
1385    unsafe { simd_add(a, b) }
1386}
1387
1388/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1389/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1390///
1391/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ph)
1392#[inline]
1393#[target_feature(enable = "avx512fp16,avx512vl")]
1394#[cfg_attr(test, assert_instr(vaddph))]
1395#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1396pub fn _mm256_mask_add_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1397    unsafe {
1398        let r = _mm256_add_ph(a, b);
1399        simd_select_bitmask(k, r, src)
1400    }
1401}
1402
1403/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1404/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1405///
1406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ph)
1407#[inline]
1408#[target_feature(enable = "avx512fp16,avx512vl")]
1409#[cfg_attr(test, assert_instr(vaddph))]
1410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1411pub fn _mm256_maskz_add_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1412    unsafe {
1413        let r = _mm256_add_ph(a, b);
1414        simd_select_bitmask(k, r, _mm256_setzero_ph())
1415    }
1416}
1417
1418/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1419///
1420/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ph)
1421#[inline]
1422#[target_feature(enable = "avx512fp16")]
1423#[cfg_attr(test, assert_instr(vaddph))]
1424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1425pub fn _mm512_add_ph(a: __m512h, b: __m512h) -> __m512h {
1426    unsafe { simd_add(a, b) }
1427}
1428
1429/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1430/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1431///
1432/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ph)
1433#[inline]
1434#[target_feature(enable = "avx512fp16")]
1435#[cfg_attr(test, assert_instr(vaddph))]
1436#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1437pub fn _mm512_mask_add_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1438    unsafe {
1439        let r = _mm512_add_ph(a, b);
1440        simd_select_bitmask(k, r, src)
1441    }
1442}
1443
1444/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1445/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1446///
1447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ph)
1448#[inline]
1449#[target_feature(enable = "avx512fp16")]
1450#[cfg_attr(test, assert_instr(vaddph))]
1451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1452pub fn _mm512_maskz_add_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1453    unsafe {
1454        let r = _mm512_add_ph(a, b);
1455        simd_select_bitmask(k, r, _mm512_setzero_ph())
1456    }
1457}
1458
1459/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1460/// Rounding is done according to the rounding parameter, which can be one of:
1461///
1462/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1463/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1464/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1465/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1466/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1467///
1468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ph)
1469#[inline]
1470#[target_feature(enable = "avx512fp16")]
1471#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1472#[rustc_legacy_const_generics(2)]
1473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1474pub fn _mm512_add_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1475    unsafe {
1476        static_assert_rounding!(ROUNDING);
1477        vaddph(a, b, ROUNDING)
1478    }
1479}
1480
1481/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1482/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1483/// Rounding is done according to the rounding parameter, which can be one of:
1484///
1485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1490///
1491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ph)
1492#[inline]
1493#[target_feature(enable = "avx512fp16")]
1494#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1495#[rustc_legacy_const_generics(4)]
1496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1497pub fn _mm512_mask_add_round_ph<const ROUNDING: i32>(
1498    src: __m512h,
1499    k: __mmask32,
1500    a: __m512h,
1501    b: __m512h,
1502) -> __m512h {
1503    unsafe {
1504        static_assert_rounding!(ROUNDING);
1505        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1506        simd_select_bitmask(k, r, src)
1507    }
1508}
1509
1510/// Add packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
1511/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1512/// Rounding is done according to the rounding parameter, which can be one of:
1513///
1514/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1515/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1516/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1517/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1518///
1519/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ph)
1520#[inline]
1521#[target_feature(enable = "avx512fp16")]
1522#[cfg_attr(test, assert_instr(vaddph, ROUNDING = 8))]
1523#[rustc_legacy_const_generics(3)]
1524#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1525pub fn _mm512_maskz_add_round_ph<const ROUNDING: i32>(
1526    k: __mmask32,
1527    a: __m512h,
1528    b: __m512h,
1529) -> __m512h {
1530    unsafe {
1531        static_assert_rounding!(ROUNDING);
1532        let r = _mm512_add_round_ph::<ROUNDING>(a, b);
1533        simd_select_bitmask(k, r, _mm512_setzero_ph())
1534    }
1535}
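
// A minimal sketch (illustrative only, not part of this module's API): passes an explicit
// rounding mode to the full-width add. The constant combines a directed rounding mode with
// `_MM_FROUND_NO_EXC`, matching the combinations listed in the documentation above. The helper
// name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn add_round_toward_zero_sketch(a: __m512h, b: __m512h) -> __m512h {
    _mm512_add_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b)
}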
1536
1537/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1538/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1539/// Rounding is done according to the rounding parameter, which can be one of:
1540///
1541/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1542/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1543/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1544/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1545/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1546///
1547/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sh)
1548#[inline]
1549#[target_feature(enable = "avx512fp16")]
1550#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1551#[rustc_legacy_const_generics(2)]
1552#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1553pub fn _mm_add_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1554    static_assert_rounding!(ROUNDING);
1555    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
1556}
1557
1558/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1559/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1560/// writemask k (the element is copied from src when mask bit 0 is not set).
1561/// Rounding is done according to the rounding parameter, which can be one of:
1562///
1563/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1564/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1565/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1566/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1567/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1568///
1569/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_round_sh)
1570#[inline]
1571#[target_feature(enable = "avx512fp16")]
1572#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1573#[rustc_legacy_const_generics(4)]
1574#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1575pub fn _mm_mask_add_round_sh<const ROUNDING: i32>(
1576    src: __m128h,
1577    k: __mmask8,
1578    a: __m128h,
1579    b: __m128h,
1580) -> __m128h {
1581    unsafe {
1582        static_assert_rounding!(ROUNDING);
1583        vaddsh(a, b, src, k, ROUNDING)
1584    }
1585}
1586
1587/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1588/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1589/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1590/// Rounding is done according to the rounding parameter, which can be one of:
1591///
1592/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1593/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1594/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1595/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1596/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1597///
1598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sh)
1599#[inline]
1600#[target_feature(enable = "avx512fp16")]
1601#[cfg_attr(test, assert_instr(vaddsh, ROUNDING = 8))]
1602#[rustc_legacy_const_generics(3)]
1603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1604pub fn _mm_maskz_add_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1605    static_assert_rounding!(ROUNDING);
1606    _mm_mask_add_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
1607}
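
// A minimal sketch (illustrative only, not part of this module's API): the scalar rounded add
// only computes lane 0 (here with round-to-nearest plus exception suppression); lanes 1..8 of
// the result are copied from `a`, as documented above. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn scalar_add_round_sketch(a: __m128h, b: __m128h) -> __m128h {
    _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
}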
1608
1609/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1610/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1611///
1612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sh)
1613#[inline]
1614#[target_feature(enable = "avx512fp16")]
1615#[cfg_attr(test, assert_instr(vaddsh))]
1616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1617pub fn _mm_add_sh(a: __m128h, b: __m128h) -> __m128h {
1618    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) + _mm_cvtsh_h(b)) }
1619}
1620
1621/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1622/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1623/// writemask k (the element is copied from src when mask bit 0 is not set).
1624///
1625/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_sh)
1626#[inline]
1627#[target_feature(enable = "avx512fp16")]
1628#[cfg_attr(test, assert_instr(vaddsh))]
1629#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1630pub fn _mm_mask_add_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1631    unsafe {
1632        let extractsrc: f16 = simd_extract!(src, 0);
1633        let mut add: f16 = extractsrc;
1634        if (k & 0b00000001) != 0 {
1635            let extracta: f16 = simd_extract!(a, 0);
1636            let extractb: f16 = simd_extract!(b, 0);
1637            add = extracta + extractb;
1638        }
1639        simd_insert!(a, 0, add)
1640    }
1641}
1642
1643/// Add the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
1644/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1645/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1646///
1647/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_sh)
1648#[inline]
1649#[target_feature(enable = "avx512fp16")]
1650#[cfg_attr(test, assert_instr(vaddsh))]
1651#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1652pub fn _mm_maskz_add_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1653    unsafe {
1654        let mut add: f16 = 0.;
1655        if (k & 0b00000001) != 0 {
1656            let extracta: f16 = simd_extract!(a, 0);
1657            let extractb: f16 = simd_extract!(b, 0);
1658            add = extracta + extractb;
1659        }
1660        simd_insert!(a, 0, add)
1661    }
1662}
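
// A minimal sketch (illustrative only, not part of this module's API): with both inputs built
// from `_mm_set1_ph`, the result of `_mm_add_sh` holds 1.0 + 2.0 = 3.0 in lane 0 while lanes
// 1..8 keep the value 1.0 copied from `a`. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn scalar_add_lane0_sketch() -> __m128h {
    let a = _mm_set1_ph(1.0);
    let b = _mm_set1_ph(2.0);
    _mm_add_sh(a, b)
}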
1663
1664/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1665///
1666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ph)
1667#[inline]
1668#[target_feature(enable = "avx512fp16,avx512vl")]
1669#[cfg_attr(test, assert_instr(vsubph))]
1670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1671pub fn _mm_sub_ph(a: __m128h, b: __m128h) -> __m128h {
1672    unsafe { simd_sub(a, b) }
1673}
1674
1675/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1676/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1677///
1678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ph)
1679#[inline]
1680#[target_feature(enable = "avx512fp16,avx512vl")]
1681#[cfg_attr(test, assert_instr(vsubph))]
1682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1683pub fn _mm_mask_sub_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1684    unsafe {
1685        let r = _mm_sub_ph(a, b);
1686        simd_select_bitmask(k, r, src)
1687    }
1688}
1689
1690/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1691/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1692///
1693/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ph)
1694#[inline]
1695#[target_feature(enable = "avx512fp16,avx512vl")]
1696#[cfg_attr(test, assert_instr(vsubph))]
1697#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1698pub fn _mm_maskz_sub_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1699    unsafe {
1700        let r = _mm_sub_ph(a, b);
1701        simd_select_bitmask(k, r, _mm_setzero_ph())
1702    }
1703}
1704
1705/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1706///
1707/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sub_ph)
1708#[inline]
1709#[target_feature(enable = "avx512fp16,avx512vl")]
1710#[cfg_attr(test, assert_instr(vsubph))]
1711#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1712pub fn _mm256_sub_ph(a: __m256h, b: __m256h) -> __m256h {
1713    unsafe { simd_sub(a, b) }
1714}
1715
1716/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1717/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1718///
1719/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ph)
1720#[inline]
1721#[target_feature(enable = "avx512fp16,avx512vl")]
1722#[cfg_attr(test, assert_instr(vsubph))]
1723#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1724pub fn _mm256_mask_sub_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1725    unsafe {
1726        let r = _mm256_sub_ph(a, b);
1727        simd_select_bitmask(k, r, src)
1728    }
1729}
1730
1731/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1732/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1733///
1734/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ph)
1735#[inline]
1736#[target_feature(enable = "avx512fp16,avx512vl")]
1737#[cfg_attr(test, assert_instr(vsubph))]
1738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1739pub fn _mm256_maskz_sub_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
1740    unsafe {
1741        let r = _mm256_sub_ph(a, b);
1742        simd_select_bitmask(k, r, _mm256_setzero_ph())
1743    }
1744}
1745
1746/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1747///
1748/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ph)
1749#[inline]
1750#[target_feature(enable = "avx512fp16")]
1751#[cfg_attr(test, assert_instr(vsubph))]
1752#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1753pub fn _mm512_sub_ph(a: __m512h, b: __m512h) -> __m512h {
1754    unsafe { simd_sub(a, b) }
1755}
1756
1757/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1758/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1759///
1760/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ph)
1761#[inline]
1762#[target_feature(enable = "avx512fp16")]
1763#[cfg_attr(test, assert_instr(vsubph))]
1764#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1765pub fn _mm512_mask_sub_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1766    unsafe {
1767        let r = _mm512_sub_ph(a, b);
1768        simd_select_bitmask(k, r, src)
1769    }
1770}
1771
1772/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1773/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1774///
1775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ph)
1776#[inline]
1777#[target_feature(enable = "avx512fp16")]
1778#[cfg_attr(test, assert_instr(vsubph))]
1779#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1780pub fn _mm512_maskz_sub_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
1781    unsafe {
1782        let r = _mm512_sub_ph(a, b);
1783        simd_select_bitmask(k, r, _mm512_setzero_ph())
1784    }
1785}
1786
1787/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst.
1788/// Rounding is done according to the rounding parameter, which can be one of:
1789///
1790/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1791/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1792/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1793/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1794/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1795///
1796/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ph)
1797#[inline]
1798#[target_feature(enable = "avx512fp16")]
1799#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1800#[rustc_legacy_const_generics(2)]
1801#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1802pub fn _mm512_sub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
1803    unsafe {
1804        static_assert_rounding!(ROUNDING);
1805        vsubph(a, b, ROUNDING)
1806    }
1807}
1808
1809/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1810/// writemask k (elements are copied from src when the corresponding mask bit is not set).
1811/// Rounding is done according to the rounding parameter, which can be one of:
1812///
1813/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1814/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1815/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1816/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1817/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1818///
1819/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ph)
1820#[inline]
1821#[target_feature(enable = "avx512fp16")]
1822#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1823#[rustc_legacy_const_generics(4)]
1824#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1825pub fn _mm512_mask_sub_round_ph<const ROUNDING: i32>(
1826    src: __m512h,
1827    k: __mmask32,
1828    a: __m512h,
1829    b: __m512h,
1830) -> __m512h {
1831    unsafe {
1832        static_assert_rounding!(ROUNDING);
1833        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1834        simd_select_bitmask(k, r, src)
1835    }
1836}
1837
1838/// Subtract packed half-precision (16-bit) floating-point elements in b from a, and store the results in dst using
1839/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
1840/// Rounding is done according to the rounding parameter, which can be one of:
1841///
1842/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1843/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1844/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1845/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1846/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1847///
1848/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ph)
1849#[inline]
1850#[target_feature(enable = "avx512fp16")]
1851#[cfg_attr(test, assert_instr(vsubph, ROUNDING = 8))]
1852#[rustc_legacy_const_generics(3)]
1853#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1854pub fn _mm512_maskz_sub_round_ph<const ROUNDING: i32>(
1855    k: __mmask32,
1856    a: __m512h,
1857    b: __m512h,
1858) -> __m512h {
1859    unsafe {
1860        static_assert_rounding!(ROUNDING);
1861        let r = _mm512_sub_round_ph::<ROUNDING>(a, b);
1862        simd_select_bitmask(k, r, _mm512_setzero_ph())
1863    }
1864}
1865
1866/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1867/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1868/// Rounding is done according to the rounding parameter, which can be one of:
1869///
1870/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1871/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1872/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1873/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1874/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1875///
1876/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_round_sh)
1877#[inline]
1878#[target_feature(enable = "avx512fp16")]
1879#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1880#[rustc_legacy_const_generics(2)]
1881#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1882pub fn _mm_sub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
1883    static_assert_rounding!(ROUNDING);
1884    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
1885}
1886
1887/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1888/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1889/// writemask k (the element is copied from src when mask bit 0 is not set).
1890/// Rounding is done according to the rounding parameter, which can be one of:
1891///
1892/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1893/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1894/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1895/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1896/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1897///
1898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_round_sh)
1899#[inline]
1900#[target_feature(enable = "avx512fp16")]
1901#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1902#[rustc_legacy_const_generics(4)]
1903#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1904pub fn _mm_mask_sub_round_sh<const ROUNDING: i32>(
1905    src: __m128h,
1906    k: __mmask8,
1907    a: __m128h,
1908    b: __m128h,
1909) -> __m128h {
1910    unsafe {
1911        static_assert_rounding!(ROUNDING);
1912        vsubsh(a, b, src, k, ROUNDING)
1913    }
1914}
1915
1916/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1917/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1918/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1919/// Rounding is done according to the rounding parameter, which can be one of:
1920///
1921/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
1922/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
1923/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
1924/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
1925/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
1926///
1927/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_round_sh)
1928#[inline]
1929#[target_feature(enable = "avx512fp16")]
1930#[cfg_attr(test, assert_instr(vsubsh, ROUNDING = 8))]
1931#[rustc_legacy_const_generics(3)]
1932#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1933pub fn _mm_maskz_sub_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1934    static_assert_rounding!(ROUNDING);
1935    _mm_mask_sub_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
1936}
1937
1938/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1939/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
1940///
1941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sh)
1942#[inline]
1943#[target_feature(enable = "avx512fp16")]
1944#[cfg_attr(test, assert_instr(vsubsh))]
1945#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1946pub fn _mm_sub_sh(a: __m128h, b: __m128h) -> __m128h {
1947    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) - _mm_cvtsh_h(b)) }
1948}
1949
1950/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1951/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1952/// writemask k (the element is copied from src when mask bit 0 is not set).
1953///
1954/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_sh)
1955#[inline]
1956#[target_feature(enable = "avx512fp16")]
1957#[cfg_attr(test, assert_instr(vsubsh))]
1958#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1959pub fn _mm_mask_sub_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1960    unsafe {
1961        let extractsrc: f16 = simd_extract!(src, 0);
1962        let mut sub: f16 = extractsrc;
1963        if (k & 0b00000001) != 0 {
1964            let extracta: f16 = simd_extract!(a, 0);
1965            let extractb: f16 = simd_extract!(b, 0);
1966            sub = extracta - extractb;
1967        }
1968        simd_insert!(a, 0, sub)
1969    }
1970}
1971
1972/// Subtract the lower half-precision (16-bit) floating-point elements in b from a, store the result in the
1973/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
1974/// zeromask k (the element is zeroed out when mask bit 0 is not set).
1975///
1976/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_sh)
1977#[inline]
1978#[target_feature(enable = "avx512fp16")]
1979#[cfg_attr(test, assert_instr(vsubsh))]
1980#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
1981pub fn _mm_maskz_sub_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
1982    unsafe {
1983        let mut sub: f16 = 0.;
1984        if (k & 0b00000001) != 0 {
1985            let extracta: f16 = simd_extract!(a, 0);
1986            let extractb: f16 = simd_extract!(b, 0);
1987            sub = extracta - extractb;
1988        }
1989        simd_insert!(a, 0, sub)
1990    }
1991}
1992
1993/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
1994///
1995/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ph)
1996#[inline]
1997#[target_feature(enable = "avx512fp16,avx512vl")]
1998#[cfg_attr(test, assert_instr(vmulph))]
1999#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2000pub fn _mm_mul_ph(a: __m128h, b: __m128h) -> __m128h {
2001    unsafe { simd_mul(a, b) }
2002}
2003
2004/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2005/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2006///
2007/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ph)
2008#[inline]
2009#[target_feature(enable = "avx512fp16,avx512vl")]
2010#[cfg_attr(test, assert_instr(vmulph))]
2011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2012pub fn _mm_mask_mul_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2013    unsafe {
2014        let r = _mm_mul_ph(a, b);
2015        simd_select_bitmask(k, r, src)
2016    }
2017}
2018
2019/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2020/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2021///
2022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ph)
2023#[inline]
2024#[target_feature(enable = "avx512fp16,avx512vl")]
2025#[cfg_attr(test, assert_instr(vmulph))]
2026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2027pub fn _mm_maskz_mul_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2028    unsafe {
2029        let r = _mm_mul_ph(a, b);
2030        simd_select_bitmask(k, r, _mm_setzero_ph())
2031    }
2032}
2033
2034/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2035///
2036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_ph)
2037#[inline]
2038#[target_feature(enable = "avx512fp16,avx512vl")]
2039#[cfg_attr(test, assert_instr(vmulph))]
2040#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2041pub fn _mm256_mul_ph(a: __m256h, b: __m256h) -> __m256h {
2042    unsafe { simd_mul(a, b) }
2043}
2044
2045/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2046/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2047///
2048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ph)
2049#[inline]
2050#[target_feature(enable = "avx512fp16,avx512vl")]
2051#[cfg_attr(test, assert_instr(vmulph))]
2052#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2053pub fn _mm256_mask_mul_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2054    unsafe {
2055        let r = _mm256_mul_ph(a, b);
2056        simd_select_bitmask(k, r, src)
2057    }
2058}
2059
2060/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2061/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2062///
2063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ph)
2064#[inline]
2065#[target_feature(enable = "avx512fp16,avx512vl")]
2066#[cfg_attr(test, assert_instr(vmulph))]
2067#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2068pub fn _mm256_maskz_mul_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2069    unsafe {
2070        let r = _mm256_mul_ph(a, b);
2071        simd_select_bitmask(k, r, _mm256_setzero_ph())
2072    }
2073}
2074
2075/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2076///
2077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ph)
2078#[inline]
2079#[target_feature(enable = "avx512fp16")]
2080#[cfg_attr(test, assert_instr(vmulph))]
2081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2082pub fn _mm512_mul_ph(a: __m512h, b: __m512h) -> __m512h {
2083    unsafe { simd_mul(a, b) }
2084}
2085
2086/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2087/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2088///
2089/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ph)
2090#[inline]
2091#[target_feature(enable = "avx512fp16")]
2092#[cfg_attr(test, assert_instr(vmulph))]
2093#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2094pub fn _mm512_mask_mul_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2095    unsafe {
2096        let r = _mm512_mul_ph(a, b);
2097        simd_select_bitmask(k, r, src)
2098    }
2099}
2100
2101/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2102/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2103///
2104/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ph)
2105#[inline]
2106#[target_feature(enable = "avx512fp16")]
2107#[cfg_attr(test, assert_instr(vmulph))]
2108#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2109pub fn _mm512_maskz_mul_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2110    unsafe {
2111        let r = _mm512_mul_ph(a, b);
2112        simd_select_bitmask(k, r, _mm512_setzero_ph())
2113    }
2114}
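
// A minimal sketch (illustrative only, not part of this module's API): scales only the lanes
// selected by `k`, leaving the other lanes of `v` unchanged, by reusing `v` as the writemask
// pass-through source. The helper name is hypothetical.
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn masked_scale_sketch(v: __m512h, scale: f16, k: __mmask32) -> __m512h {
    let s = _mm512_set1_ph(scale);
    _mm512_mask_mul_ph(v, k, v, s)
}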
2115
2116/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst.
2117/// Rounding is done according to the rounding parameter, which can be one of:
2118///
2119/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2120/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2121/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2122/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2123/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2124///
2125/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ph)
2126#[inline]
2127#[target_feature(enable = "avx512fp16")]
2128#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2129#[rustc_legacy_const_generics(2)]
2130#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2131pub fn _mm512_mul_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2132    unsafe {
2133        static_assert_rounding!(ROUNDING);
2134        vmulph(a, b, ROUNDING)
2135    }
2136}
2137
2138/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2139/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2140/// Rounding is done according to the rounding parameter, which can be one of:
2141///
2142/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2143/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2144/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2145/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2146/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2147///
2148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ph)
2149#[inline]
2150#[target_feature(enable = "avx512fp16")]
2151#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2152#[rustc_legacy_const_generics(4)]
2153#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2154pub fn _mm512_mask_mul_round_ph<const ROUNDING: i32>(
2155    src: __m512h,
2156    k: __mmask32,
2157    a: __m512h,
2158    b: __m512h,
2159) -> __m512h {
2160    unsafe {
2161        static_assert_rounding!(ROUNDING);
2162        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2163        simd_select_bitmask(k, r, src)
2164    }
2165}
2166
2167/// Multiply packed half-precision (16-bit) floating-point elements in a and b, and store the results in dst using
2168/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2169/// Rounding is done according to the rounding parameter, which can be one of:
2170///
2171/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2172/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2173/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2174/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2175/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2176///
2177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ph)
2178#[inline]
2179#[target_feature(enable = "avx512fp16")]
2180#[cfg_attr(test, assert_instr(vmulph, ROUNDING = 8))]
2181#[rustc_legacy_const_generics(3)]
2182#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2183pub fn _mm512_maskz_mul_round_ph<const ROUNDING: i32>(
2184    k: __mmask32,
2185    a: __m512h,
2186    b: __m512h,
2187) -> __m512h {
2188    unsafe {
2189        static_assert_rounding!(ROUNDING);
2190        let r = _mm512_mul_round_ph::<ROUNDING>(a, b);
2191        simd_select_bitmask(k, r, _mm512_setzero_ph())
2192    }
2193}
2194
2195/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2196/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2197/// Rounding is done according to the rounding parameter, which can be one of:
2198///
2199/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2200/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2201/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2202/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2203/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2204///
2205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sh)
2206#[inline]
2207#[target_feature(enable = "avx512fp16")]
2208#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2209#[rustc_legacy_const_generics(2)]
2210#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2211pub fn _mm_mul_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2212    static_assert_rounding!(ROUNDING);
2213    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2214}
2215
2216/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2217/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2218/// writemask k (the element is copied from src when mask bit 0 is not set).
2219/// Rounding is done according to the rounding parameter, which can be one of:
2220///
2221/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2222/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2223/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2224/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2225/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2226///
2227/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sh)
2228#[inline]
2229#[target_feature(enable = "avx512fp16")]
2230#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2231#[rustc_legacy_const_generics(4)]
2232#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2233pub fn _mm_mask_mul_round_sh<const ROUNDING: i32>(
2234    src: __m128h,
2235    k: __mmask8,
2236    a: __m128h,
2237    b: __m128h,
2238) -> __m128h {
2239    unsafe {
2240        static_assert_rounding!(ROUNDING);
2241        vmulsh(a, b, src, k, ROUNDING)
2242    }
2243}
2244
2245/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2246/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2247/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2248/// Rounding is done according to the rounding parameter, which can be one of:
2249///
2250/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2251/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2252/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2253/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2254/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2255///
2256/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sh)
2257#[inline]
2258#[target_feature(enable = "avx512fp16")]
2259#[cfg_attr(test, assert_instr(vmulsh, ROUNDING = 8))]
2260#[rustc_legacy_const_generics(3)]
2261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2262pub fn _mm_maskz_mul_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2263    static_assert_rounding!(ROUNDING);
2264    _mm_mask_mul_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2265}
2266
2267/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2268/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2269///
2270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sh)
2271#[inline]
2272#[target_feature(enable = "avx512fp16")]
2273#[cfg_attr(test, assert_instr(vmulsh))]
2274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2275pub fn _mm_mul_sh(a: __m128h, b: __m128h) -> __m128h {
2276    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) * _mm_cvtsh_h(b)) }
2277}
2278
2279/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2280/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2281/// writemask k (the element is copied from src when mask bit 0 is not set).
2282///
2283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sh)
2284#[inline]
2285#[target_feature(enable = "avx512fp16")]
2286#[cfg_attr(test, assert_instr(vmulsh))]
2287#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2288pub fn _mm_mask_mul_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2289    unsafe {
2290        let extractsrc: f16 = simd_extract!(src, 0);
2291        let mut mul: f16 = extractsrc;
2292        if (k & 0b00000001) != 0 {
2293            let extracta: f16 = simd_extract!(a, 0);
2294            let extractb: f16 = simd_extract!(b, 0);
2295            mul = extracta * extractb;
2296        }
2297        simd_insert!(a, 0, mul)
2298    }
2299}
2300
2301/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, store the result in the
2302/// lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst using
2303/// zeromask k (the element is zeroed out when mask bit 0 is not set).
2304///
2305/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sh)
2306#[inline]
2307#[target_feature(enable = "avx512fp16")]
2308#[cfg_attr(test, assert_instr(vmulsh))]
2309#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2310pub fn _mm_maskz_mul_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2311    unsafe {
2312        let mut mul: f16 = 0.;
2313        if (k & 0b00000001) != 0 {
2314            let extracta: f16 = simd_extract!(a, 0);
2315            let extractb: f16 = simd_extract!(b, 0);
2316            mul = extracta * extractb;
2317        }
2318        simd_insert!(a, 0, mul)
2319    }
2320}
2321
2322/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2323///
2324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_ph)
2325#[inline]
2326#[target_feature(enable = "avx512fp16,avx512vl")]
2327#[cfg_attr(test, assert_instr(vdivph))]
2328#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2329pub fn _mm_div_ph(a: __m128h, b: __m128h) -> __m128h {
2330    unsafe { simd_div(a, b) }
2331}
2332
2333/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2334/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2335///
2336/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ph)
2337#[inline]
2338#[target_feature(enable = "avx512fp16,avx512vl")]
2339#[cfg_attr(test, assert_instr(vdivph))]
2340#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2341pub fn _mm_mask_div_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2342    unsafe {
2343        let r = _mm_div_ph(a, b);
2344        simd_select_bitmask(k, r, src)
2345    }
2346}
2347
2348/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2349/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2350///
2351/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ph)
2352#[inline]
2353#[target_feature(enable = "avx512fp16,avx512vl")]
2354#[cfg_attr(test, assert_instr(vdivph))]
2355#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2356pub fn _mm_maskz_div_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2357    unsafe {
2358        let r = _mm_div_ph(a, b);
2359        simd_select_bitmask(k, r, _mm_setzero_ph())
2360    }
2361}
2362
2363/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2364///
2365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_div_ph)
2366#[inline]
2367#[target_feature(enable = "avx512fp16,avx512vl")]
2368#[cfg_attr(test, assert_instr(vdivph))]
2369#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2370pub fn _mm256_div_ph(a: __m256h, b: __m256h) -> __m256h {
2371    unsafe { simd_div(a, b) }
2372}
2373
2374/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2375/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2376///
2377/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ph)
2378#[inline]
2379#[target_feature(enable = "avx512fp16,avx512vl")]
2380#[cfg_attr(test, assert_instr(vdivph))]
2381#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2382pub fn _mm256_mask_div_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2383    unsafe {
2384        let r = _mm256_div_ph(a, b);
2385        simd_select_bitmask(k, r, src)
2386    }
2387}
2388
2389/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2390/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2391///
2392/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ph)
2393#[inline]
2394#[target_feature(enable = "avx512fp16,avx512vl")]
2395#[cfg_attr(test, assert_instr(vdivph))]
2396#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2397pub fn _mm256_maskz_div_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
2398    unsafe {
2399        let r = _mm256_div_ph(a, b);
2400        simd_select_bitmask(k, r, _mm256_setzero_ph())
2401    }
2402}
2403
2404/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2405///
2406/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ph)
2407#[inline]
2408#[target_feature(enable = "avx512fp16")]
2409#[cfg_attr(test, assert_instr(vdivph))]
2410#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2411pub fn _mm512_div_ph(a: __m512h, b: __m512h) -> __m512h {
2412    unsafe { simd_div(a, b) }
2413}
2414
2415/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2416/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2417///
2418/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ph)
2419#[inline]
2420#[target_feature(enable = "avx512fp16")]
2421#[cfg_attr(test, assert_instr(vdivph))]
2422#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2423pub fn _mm512_mask_div_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2424    unsafe {
2425        let r = _mm512_div_ph(a, b);
2426        simd_select_bitmask(k, r, src)
2427    }
2428}
2429
2430/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2431/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2432///
2433/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ph)
2434#[inline]
2435#[target_feature(enable = "avx512fp16")]
2436#[cfg_attr(test, assert_instr(vdivph))]
2437#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2438pub fn _mm512_maskz_div_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
2439    unsafe {
2440        let r = _mm512_div_ph(a, b);
2441        simd_select_bitmask(k, r, _mm512_setzero_ph())
2442    }
2443}
2444
2445/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst.
2446/// Rounding is done according to the rounding parameter, which can be one of:
2447///
2448/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2449/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2450/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2451/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2452/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2453///
2454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ph)
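///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation) of passing
/// the rounding mode as a const generic; it assumes `avx512fp16` has been detected at
/// runtime:
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ph(1.0);
///     let b = _mm512_set1_ph(3.0);
///     // Divide with round-toward-zero and suppressed exceptions.
///     let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a, b);
/// }
/// ```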
2455#[inline]
2456#[target_feature(enable = "avx512fp16")]
2457#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2458#[rustc_legacy_const_generics(2)]
2459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2460pub fn _mm512_div_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2461    unsafe {
2462        static_assert_rounding!(ROUNDING);
2463        vdivph(a, b, ROUNDING)
2464    }
2465}
2466
2467/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2468/// writemask k (elements are copied from src when the corresponding mask bit is not set).
2469/// Rounding is done according to the rounding parameter, which can be one of:
2470///
2471/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2472/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2473/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2474/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2475/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2476///
2477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ph)
2478#[inline]
2479#[target_feature(enable = "avx512fp16")]
2480#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2481#[rustc_legacy_const_generics(4)]
2482#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2483pub fn _mm512_mask_div_round_ph<const ROUNDING: i32>(
2484    src: __m512h,
2485    k: __mmask32,
2486    a: __m512h,
2487    b: __m512h,
2488) -> __m512h {
2489    unsafe {
2490        static_assert_rounding!(ROUNDING);
2491        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2492        simd_select_bitmask(k, r, src)
2493    }
2494}
2495
2496/// Divide packed half-precision (16-bit) floating-point elements in a by b, and store the results in dst using
2497/// zeromask k (elements are zeroed out when the corresponding mask bit is not set).
2498/// Rounding is done according to the rounding parameter, which can be one of:
2499///
2500/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2501/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2502/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2503/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2504/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2505///
2506/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ph)
2507#[inline]
2508#[target_feature(enable = "avx512fp16")]
2509#[cfg_attr(test, assert_instr(vdivph, ROUNDING = 8))]
2510#[rustc_legacy_const_generics(3)]
2511#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2512pub fn _mm512_maskz_div_round_ph<const ROUNDING: i32>(
2513    k: __mmask32,
2514    a: __m512h,
2515    b: __m512h,
2516) -> __m512h {
2517    unsafe {
2518        static_assert_rounding!(ROUNDING);
2519        let r = _mm512_div_round_ph::<ROUNDING>(a, b);
2520        simd_select_bitmask(k, r, _mm512_setzero_ph())
2521    }
2522}
2523
2524/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2525/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2526/// Rounding is done according to the rounding parameter, which can be one of:
2527///
2528/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2529/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2530/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2531/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2532/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2533///
2534/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_round_sh)
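///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation); it assumes
/// `avx512fp16` has been detected at runtime:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_sh(1.0);
///     let b = _mm_set_sh(3.0);
///     let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
///     // lane 0 holds 1.0 / 3.0 rounded to the nearest representable f16;
///     // lanes 1..8 are copied from `a` (all 0.0 here)
/// }
/// ```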
2535#[inline]
2536#[target_feature(enable = "avx512fp16")]
2537#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2538#[rustc_legacy_const_generics(2)]
2539#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2540pub fn _mm_div_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2541    static_assert_rounding!(ROUNDING);
2542    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2543}
2544
2545/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2546/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2547/// using writemask k (the element is copied from src when mask bit 0 is not set).
2548/// Rounding is done according to the rounding parameter, which can be one of:
2549///
2550/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2551/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2552/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2553/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2554/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2555///
2556/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_round_sh)
2557#[inline]
2558#[target_feature(enable = "avx512fp16")]
2559#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2560#[rustc_legacy_const_generics(4)]
2561#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2562pub fn _mm_mask_div_round_sh<const ROUNDING: i32>(
2563    src: __m128h,
2564    k: __mmask8,
2565    a: __m128h,
2566    b: __m128h,
2567) -> __m128h {
2568    unsafe {
2569        static_assert_rounding!(ROUNDING);
2570        vdivsh(a, b, src, k, ROUNDING)
2571    }
2572}
2573
2574/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2575/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2576/// using zeromask k (the element is zeroed out when mask bit 0 is not set).
2577/// Rounding is done according to the rounding parameter, which can be one of:
2578///
2579/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2580/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2581/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2582/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2583/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2584///
2585/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_round_sh)
2586#[inline]
2587#[target_feature(enable = "avx512fp16")]
2588#[cfg_attr(test, assert_instr(vdivsh, ROUNDING = 8))]
2589#[rustc_legacy_const_generics(3)]
2590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2591pub fn _mm_maskz_div_round_sh<const ROUNDING: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2592    static_assert_rounding!(ROUNDING);
2593    _mm_mask_div_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2594}
2595
2596/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2597/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
2598///
2599/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sh)
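///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation); it assumes
/// `avx512fp16` has been detected at runtime:
///
/// ```ignore
/// unsafe {
///     let a = _mm_set_ph(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 9.0);
///     let b = _mm_set_sh(3.0);
///     let r = _mm_div_sh(a, b);
///     // lane 0 == 9.0 / 3.0 == 3.0; lanes 1..8 keep their values from `a`
/// }
/// ```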
2600#[inline]
2601#[target_feature(enable = "avx512fp16")]
2602#[cfg_attr(test, assert_instr(vdivsh))]
2603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2604pub fn _mm_div_sh(a: __m128h, b: __m128h) -> __m128h {
2605    unsafe { simd_insert!(a, 0, _mm_cvtsh_h(a) / _mm_cvtsh_h(b)) }
2606}
2607
2608/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2609/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2610/// using writemask k (the element is copied from src when mask bit 0 is not set).
2611///
2612/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_sh)
2613#[inline]
2614#[target_feature(enable = "avx512fp16")]
2615#[cfg_attr(test, assert_instr(vdivsh))]
2616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2617pub fn _mm_mask_div_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2618    unsafe {
2619        let extractsrc: f16 = simd_extract!(src, 0);
2620        let mut quotient: f16 = extractsrc;
2621        if (k & 0b00000001) != 0 {
2622            let extracta: f16 = simd_extract!(a, 0);
2623            let extractb: f16 = simd_extract!(b, 0);
2624            quotient = extracta / extractb;
2625        }
2626        simd_insert!(a, 0, quotient)
2627    }
2628}
2629
2630/// Divide the lower half-precision (16-bit) floating-point element in a by the lower element in b, store the
2631/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst
2632/// using zeromask k (the element is zeroed out when mask bit 0 is not set).
2633///
2634/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_sh)
2635#[inline]
2636#[target_feature(enable = "avx512fp16")]
2637#[cfg_attr(test, assert_instr(vdivsh))]
2638#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2639pub fn _mm_maskz_div_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2640    unsafe {
2641        let mut quotient: f16 = 0.;
2642        if (k & 0b00000001) != 0 {
2643            let extracta: f16 = simd_extract!(a, 0);
2644            let extractb: f16 = simd_extract!(b, 0);
2645            quotient = extracta / extractb;
2646        }
2647        simd_insert!(a, 0, quotient)
2648    }
2649}
2650
2651/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2652/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2653/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2654///
2655/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pch)
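///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation) of the
/// interleaved real/imaginary layout; it uses `_mm_setr_ph` (lowest lane first) and
/// assumes `avx512fp16`/`avx512vl` support has been detected at runtime:
///
/// ```ignore
/// unsafe {
///     // The first complex number of `a` is 1 + 2i, of `b` is 3 + 4i.
///     let a = _mm_setr_ph(1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
///     let b = _mm_setr_ph(3.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
///     let r = _mm_mul_pch(a, b);
///     // (1 + 2i) * (3 + 4i) == -5 + 10i, so lanes 0..2 are [-5.0, 10.0]
/// }
/// ```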
2656#[inline]
2657#[target_feature(enable = "avx512fp16,avx512vl")]
2658#[cfg_attr(test, assert_instr(vfmulcph))]
2659#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2660pub fn _mm_mul_pch(a: __m128h, b: __m128h) -> __m128h {
2661    _mm_mask_mul_pch(_mm_undefined_ph(), 0xff, a, b)
2662}
2663
2664/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2665/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2666/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2667///
2668/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pch)
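///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation). Note that
/// each mask bit selects one complex number, i.e. one pair of f16 lanes; the sketch
/// assumes `avx512fp16`/`avx512vl` support has been detected at runtime:
///
/// ```ignore
/// unsafe {
///     let src = _mm_set1_ph(9.0);
///     let a = _mm_setr_ph(1.0, 2.0, 1.0, 2.0, 0.0, 0.0, 0.0, 0.0);
///     let b = _mm_setr_ph(3.0, 4.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0);
///     // Only complex number 0 is computed; complex numbers 1..4 are copied from `src`.
///     let r = _mm_mask_mul_pch(src, 0b0001, a, b);
///     // r == [-5.0, 10.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0]
/// }
/// ```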
2669#[inline]
2670#[target_feature(enable = "avx512fp16,avx512vl")]
2671#[cfg_attr(test, assert_instr(vfmulcph))]
2672#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2673pub fn _mm_mask_mul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2674    unsafe { transmute(vfmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
2675}
2676
2677/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2678/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2679/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2680///
2681/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pch)
2682#[inline]
2683#[target_feature(enable = "avx512fp16,avx512vl")]
2684#[cfg_attr(test, assert_instr(vfmulcph))]
2685#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2686pub fn _mm_maskz_mul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2687    _mm_mask_mul_pch(_mm_setzero_ph(), k, a, b)
2688}
2689
2690/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2691/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2692/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2693///
2694/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mul_pch)
2695#[inline]
2696#[target_feature(enable = "avx512fp16,avx512vl")]
2697#[cfg_attr(test, assert_instr(vfmulcph))]
2698#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2699pub fn _mm256_mul_pch(a: __m256h, b: __m256h) -> __m256h {
2700    _mm256_mask_mul_pch(_mm256_undefined_ph(), 0xff, a, b)
2701}
2702
2703/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2704/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2705/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2706///
2707/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pch)
2708#[inline]
2709#[target_feature(enable = "avx512fp16,avx512vl")]
2710#[cfg_attr(test, assert_instr(vfmulcph))]
2711#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2712pub fn _mm256_mask_mul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2713    unsafe { transmute(vfmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
2714}
2715
2716/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2717/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2718/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2719///
2720/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pch)
2721#[inline]
2722#[target_feature(enable = "avx512fp16,avx512vl")]
2723#[cfg_attr(test, assert_instr(vfmulcph))]
2724#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2725pub fn _mm256_maskz_mul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
2726    _mm256_mask_mul_pch(_mm256_setzero_ph(), k, a, b)
2727}
2728
2729/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2730/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2731/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2732///
2733/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pch)
2734#[inline]
2735#[target_feature(enable = "avx512fp16")]
2736#[cfg_attr(test, assert_instr(vfmulcph))]
2737#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2738pub fn _mm512_mul_pch(a: __m512h, b: __m512h) -> __m512h {
2739    _mm512_mask_mul_pch(_mm512_undefined_ph(), 0xffff, a, b)
2740}
2741
2742/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2743/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2744/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2745///
2746/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pch)
2747#[inline]
2748#[target_feature(enable = "avx512fp16")]
2749#[cfg_attr(test, assert_instr(vfmulcph))]
2750#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2751pub fn _mm512_mask_mul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2752    _mm512_mask_mul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2753}
2754
2755/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2756/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2757/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2758///
2759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pch)
2760#[inline]
2761#[target_feature(enable = "avx512fp16")]
2762#[cfg_attr(test, assert_instr(vfmulcph))]
2763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2764pub fn _mm512_maskz_mul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
2765    _mm512_mask_mul_pch(_mm512_setzero_ph(), k, a, b)
2766}
2767
2768/// Multiply the packed complex numbers in a and b, and store the results in dst. Each complex number is
2769/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2770/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2771///
2772/// Rounding is done according to the rounding parameter, which can be one of:
2773///
2774/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2775/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2776/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2777/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2778/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2779///
2780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pch)
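///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation) combining
/// the complex layout with an explicit rounding mode; it assumes `avx512fp16` has been
/// detected at runtime:
///
/// ```ignore
/// unsafe {
///     let a = _mm512_set1_ph(1.0); // every complex number is 1 + 1i
///     let b = _mm512_set1_ph(2.0); // every complex number is 2 + 2i
///     let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
///     // (1 + 1i) * (2 + 2i) == 0 + 4i, so every lane pair is [0.0, 4.0]
/// }
/// ```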
2781#[inline]
2782#[target_feature(enable = "avx512fp16")]
2783#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2784#[rustc_legacy_const_generics(2)]
2785#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2786pub fn _mm512_mul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
2787    static_assert_rounding!(ROUNDING);
2788    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
2789}
2790
2791/// Multiply the packed complex numbers in a and b, and store the results in dst using writemask k (the element
2792/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
2793/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2794///
2795/// Rounding is done according to the rounding parameter, which can be one of:
2796///
2797/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2798/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2799/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2800/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2801/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2802///
2803/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pch)
2804#[inline]
2805#[target_feature(enable = "avx512fp16")]
2806#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2807#[rustc_legacy_const_generics(4)]
2808#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2809pub fn _mm512_mask_mul_round_pch<const ROUNDING: i32>(
2810    src: __m512h,
2811    k: __mmask16,
2812    a: __m512h,
2813    b: __m512h,
2814) -> __m512h {
2815    unsafe {
2816        static_assert_rounding!(ROUNDING);
2817        transmute(vfmulcph_512(
2818            transmute(a),
2819            transmute(b),
2820            transmute(src),
2821            k,
2822            ROUNDING,
2823        ))
2824    }
2825}
2826
2827/// Multiply the packed complex numbers in a and b, and store the results in dst using zeromask k (the element
2828/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
2829/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2830///
2831/// Rounding is done according to the rounding parameter, which can be one of:
2832///
2833/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2834/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2835/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2836/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2837/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2838///
2839/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pch)
2840#[inline]
2841#[target_feature(enable = "avx512fp16")]
2842#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
2843#[rustc_legacy_const_generics(3)]
2844#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2845pub fn _mm512_maskz_mul_round_pch<const ROUNDING: i32>(
2846    k: __mmask16,
2847    a: __m512h,
2848    b: __m512h,
2849) -> __m512h {
2850    static_assert_rounding!(ROUNDING);
2851    _mm512_mask_mul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
2852}
2853
2854/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2855/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2856/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2857/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2858///
2859/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_sch)
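///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation); it uses
/// `_mm_setr_ph` (lowest lane first) and assumes `avx512fp16` support has been detected
/// at runtime:
///
/// ```ignore
/// unsafe {
///     let a = _mm_setr_ph(1.0, 2.0, 5.0, 6.0, 0.0, 0.0, 0.0, 0.0);
///     let b = _mm_setr_ph(3.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
///     let r = _mm_mul_sch(a, b);
///     // lanes 0..2 hold (1 + 2i) * (3 + 4i) == -5 + 10i;
///     // lanes 2..8 are copied from `a` (5.0, 6.0, 0.0, ...)
/// }
/// ```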
2860#[inline]
2861#[target_feature(enable = "avx512fp16")]
2862#[cfg_attr(test, assert_instr(vfmulcsh))]
2863#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2864pub fn _mm_mul_sch(a: __m128h, b: __m128h) -> __m128h {
2865    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
2866}
2867
2868/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2869/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2870/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent
2871/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2872///
2873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_sch)
2874#[inline]
2875#[target_feature(enable = "avx512fp16")]
2876#[cfg_attr(test, assert_instr(vfmulcsh))]
2877#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2878pub fn _mm_mask_mul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2879    _mm_mask_mul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
2880}
2881
2882/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2883/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2884/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2885/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2886///
2887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_sch)
2888#[inline]
2889#[target_feature(enable = "avx512fp16")]
2890#[cfg_attr(test, assert_instr(vfmulcsh))]
2891#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2892pub fn _mm_maskz_mul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
2893    _mm_mask_mul_sch(f16x8::ZERO.as_m128h(), k, a, b)
2894}
2895
2896/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst,
2897/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
2898/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2899/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2900///
2901/// Rounding is done according to the rounding parameter, which can be one of:
2902///
2903/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2904/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2905/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2906/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2907/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2908///
2909/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_round_sch)
2910#[inline]
2911#[target_feature(enable = "avx512fp16")]
2912#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2913#[rustc_legacy_const_generics(2)]
2914#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2915pub fn _mm_mul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
2916    static_assert_rounding!(ROUNDING);
2917    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
2918}
2919
2920/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2921/// writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 6 packed
2922/// elements from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2923/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2924///
2925/// Rounding is done according to the rounding parameter, which can be one of:
2926///
2927/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2928/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2929/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2930/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2931/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2932///
2933/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_round_sch)
2934#[inline]
2935#[target_feature(enable = "avx512fp16")]
2936#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2937#[rustc_legacy_const_generics(4)]
2938#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2939pub fn _mm_mask_mul_round_sch<const ROUNDING: i32>(
2940    src: __m128h,
2941    k: __mmask8,
2942    a: __m128h,
2943    b: __m128h,
2944) -> __m128h {
2945    unsafe {
2946        static_assert_rounding!(ROUNDING);
2947        transmute(vfmulcsh(
2948            transmute(a),
2949            transmute(b),
2950            transmute(src),
2951            k,
2952            ROUNDING,
2953        ))
2954    }
2955}
2956
2957/// Multiply the lower complex numbers in a and b, and store the result in the lower elements of dst using
2958/// zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 6 packed elements
2959/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision
2960/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2961///
2962/// Rounding is done according to the rounding parameter, which can be one of:
2963///
2964/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
2965/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
2966/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
2967/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
2968/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
2969///
2970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_round_sch)
2971#[inline]
2972#[target_feature(enable = "avx512fp16")]
2973#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
2974#[rustc_legacy_const_generics(3)]
2975#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2976pub fn _mm_maskz_mul_round_sch<const ROUNDING: i32>(
2977    k: __mmask8,
2978    a: __m128h,
2979    b: __m128h,
2980) -> __m128h {
2981    static_assert_rounding!(ROUNDING);
2982    _mm_mask_mul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
2983}
2984
2985/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
2986/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
2987/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
2988///
2989/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_pch)
2990#[inline]
2991#[target_feature(enable = "avx512fp16,avx512vl")]
2992#[cfg_attr(test, assert_instr(vfmulcph))]
2993#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
2994pub fn _mm_fmul_pch(a: __m128h, b: __m128h) -> __m128h {
2995    _mm_mul_pch(a, b)
2996}
2997
2998/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
2999/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent
3000/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3001///
3002/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_pch)
3003#[inline]
3004#[target_feature(enable = "avx512fp16,avx512vl")]
3005#[cfg_attr(test, assert_instr(vfmulcph))]
3006#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3007pub fn _mm_mask_fmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3008    _mm_mask_mul_pch(src, k, a, b)
3009}
3010
3011/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3012/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3013/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3014///
3015/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_pch)
3016#[inline]
3017#[target_feature(enable = "avx512fp16,avx512vl")]
3018#[cfg_attr(test, assert_instr(vfmulcph))]
3019#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3020pub fn _mm_maskz_fmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3021    _mm_maskz_mul_pch(k, a, b)
3022}
3023
3024/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is
3025/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3026/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3027///
3028/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmul_pch)
3029#[inline]
3030#[target_feature(enable = "avx512fp16,avx512vl")]
3031#[cfg_attr(test, assert_instr(vfmulcph))]
3032#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3033pub fn _mm256_fmul_pch(a: __m256h, b: __m256h) -> __m256h {
3034    _mm256_mul_pch(a, b)
3035}
3036
3037/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3038/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3039/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3040///
3041/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmul_pch)
3042#[inline]
3043#[target_feature(enable = "avx512fp16,avx512vl")]
3044#[cfg_attr(test, assert_instr(vfmulcph))]
3045#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3046pub fn _mm256_mask_fmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3047    _mm256_mask_mul_pch(src, k, a, b)
3048}
3049
3050/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3051/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3052/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3053///
3054/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmul_pch)
3055#[inline]
3056#[target_feature(enable = "avx512fp16,avx512vl")]
3057#[cfg_attr(test, assert_instr(vfmulcph))]
3058#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3059pub fn _mm256_maskz_fmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3060    _mm256_maskz_mul_pch(k, a, b)
3061}
3062
3063/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
3064/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3065///
3066/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_pch)
3067#[inline]
3068#[target_feature(enable = "avx512fp16")]
3069#[cfg_attr(test, assert_instr(vfmulcph))]
3070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3071pub fn _mm512_fmul_pch(a: __m512h, b: __m512h) -> __m512h {
3072    _mm512_mul_pch(a, b)
3073}
3074
3075/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3076/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3077/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3078///
3079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_pch)
3080#[inline]
3081#[target_feature(enable = "avx512fp16")]
3082#[cfg_attr(test, assert_instr(vfmulcph))]
3083#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3084pub fn _mm512_mask_fmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3085    _mm512_mask_mul_pch(src, k, a, b)
3086}
3087
3088/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3089/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3090/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3091///
3092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_pch)
3093#[inline]
3094#[target_feature(enable = "avx512fp16")]
3095#[cfg_attr(test, assert_instr(vfmulcph))]
3096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3097pub fn _mm512_maskz_fmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3098    _mm512_maskz_mul_pch(k, a, b)
3099}
3100
3101/// Multiply packed complex numbers in a and b, and store the results in dst. Each complex number is composed
3102/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3103/// Rounding is done according to the rounding parameter, which can be one of:
3104///
3105/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3106/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3107/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3108/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3109/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3110///
3111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmul_round_pch)
3112#[inline]
3113#[target_feature(enable = "avx512fp16")]
3114#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3115#[rustc_legacy_const_generics(2)]
3116#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3117pub fn _mm512_fmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3118    static_assert_rounding!(ROUNDING);
3119    _mm512_mul_round_pch::<ROUNDING>(a, b)
3120}
3121
3122/// Multiply packed complex numbers in a and b, and store the results in dst using writemask k (the element
3123/// is copied from src when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3124/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3125/// Rounding is done according to the rounding parameter, which can be one of:
3126///
3127/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3128/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3129/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3130/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3131/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3132///
3133/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmul_round_pch)
3134#[inline]
3135#[target_feature(enable = "avx512fp16")]
3136#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3137#[rustc_legacy_const_generics(4)]
3138#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3139pub fn _mm512_mask_fmul_round_pch<const ROUNDING: i32>(
3140    src: __m512h,
3141    k: __mmask16,
3142    a: __m512h,
3143    b: __m512h,
3144) -> __m512h {
3145    static_assert_rounding!(ROUNDING);
3146    _mm512_mask_mul_round_pch::<ROUNDING>(src, k, a, b)
3147}
3148
3149/// Multiply packed complex numbers in a and b, and store the results in dst using zeromask k (the element
3150/// is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent half-precision
3151/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3152/// Rounding is done according to the rounding parameter, which can be one of:
3153///
3154/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3155/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3156/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3157/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3158/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3159///
3160/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmul_round_pch)
3161#[inline]
3162#[target_feature(enable = "avx512fp16")]
3163#[cfg_attr(test, assert_instr(vfmulcph, ROUNDING = 8))]
3164#[rustc_legacy_const_generics(3)]
3165#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3166pub fn _mm512_maskz_fmul_round_pch<const ROUNDING: i32>(
3167    k: __mmask16,
3168    a: __m512h,
3169    b: __m512h,
3170) -> __m512h {
3171    static_assert_rounding!(ROUNDING);
3172    _mm512_maskz_mul_round_pch::<ROUNDING>(k, a, b)
3173}
3174
3175/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is
3176/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
3177/// number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3178///
3179/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_sch)
3180#[inline]
3181#[target_feature(enable = "avx512fp16")]
3182#[cfg_attr(test, assert_instr(vfmulcsh))]
3183#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3184pub fn _mm_fmul_sch(a: __m128h, b: __m128h) -> __m128h {
3185    _mm_mul_sch(a, b)
3186}
3187
3188/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3189/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3190/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3191///
3192/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_sch)
3193#[inline]
3194#[target_feature(enable = "avx512fp16")]
3195#[cfg_attr(test, assert_instr(vfmulcsh))]
3196#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3197pub fn _mm_mask_fmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3198    _mm_mask_mul_sch(src, k, a, b)
3199}
3200
3201/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3202/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3203/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3204///
3205/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_sch)
3206#[inline]
3207#[target_feature(enable = "avx512fp16")]
3208#[cfg_attr(test, assert_instr(vfmulcsh))]
3209#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3210pub fn _mm_maskz_fmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3211    _mm_maskz_mul_sch(k, a, b)
3212}
3213
3214/// Multiply the lower complex numbers in a and b, and store the results in dst. Each complex number is composed
3215/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3216///
3217/// Rounding is done according to the rounding parameter, which can be one of:
3218///
3219/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3220/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3221/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3222/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3223/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3224///
3225/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmul_round_sch)
3226#[inline]
3227#[target_feature(enable = "avx512fp16")]
3228#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3229#[rustc_legacy_const_generics(2)]
3230#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3231pub fn _mm_fmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3232    static_assert_rounding!(ROUNDING);
3233    _mm_mul_round_sch::<ROUNDING>(a, b)
3234}
3235
3236/// Multiply the lower complex numbers in a and b, and store the results in dst using writemask k (the element
3237/// is copied from src when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3238/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3239///
3240/// Rounding is done according to the rounding parameter, which can be one of:
3241///
3242/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3243/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3244/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3245/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3246/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3247///
3248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmul_round_sch)
3249#[inline]
3250#[target_feature(enable = "avx512fp16")]
3251#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3252#[rustc_legacy_const_generics(4)]
3253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3254pub fn _mm_mask_fmul_round_sch<const ROUNDING: i32>(
3255    src: __m128h,
3256    k: __mmask8,
3257    a: __m128h,
3258    b: __m128h,
3259) -> __m128h {
3260    static_assert_rounding!(ROUNDING);
3261    _mm_mask_mul_round_sch::<ROUNDING>(src, k, a, b)
3262}
3263
3264/// Multiply the lower complex numbers in a and b, and store the results in dst using zeromask k (the element
3265/// is zeroed out when mask bit 0 is not set). Each complex number is composed of two adjacent half-precision
3266/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
3267///
3268/// Rounding is done according to the rounding parameter, which can be one of:
3269///
3270/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3271/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3272/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3273/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3274/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3275///
3276/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmul_round_sch)
3277#[inline]
3278#[target_feature(enable = "avx512fp16")]
3279#[cfg_attr(test, assert_instr(vfmulcsh, ROUNDING = 8))]
3280#[rustc_legacy_const_generics(3)]
3281#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3282pub fn _mm_maskz_fmul_round_sch<const ROUNDING: i32>(
3283    k: __mmask8,
3284    a: __m128h,
3285    b: __m128h,
3286) -> __m128h {
3287    static_assert_rounding!(ROUNDING);
3288    _mm_maskz_mul_round_sch::<ROUNDING>(k, a, b)
3289}
3290
3291/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3292/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3293/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3294/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3295///
3296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_pch)
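///
/// # Example
///
/// A minimal usage sketch (illustrative only, not from Intel's documentation); `b` is
/// conjugated before the multiply. It uses `_mm_setr_ph` (lowest lane first) and assumes
/// `avx512fp16`/`avx512vl` support has been detected at runtime:
///
/// ```ignore
/// unsafe {
///     let a = _mm_setr_ph(1.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
///     let b = _mm_setr_ph(3.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
///     let r = _mm_cmul_pch(a, b);
///     // (1 + 2i) * conj(3 + 4i) == (1 + 2i) * (3 - 4i) == 11 + 2i,
///     // so lanes 0..2 are [11.0, 2.0]
/// }
/// ```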
3297#[inline]
3298#[target_feature(enable = "avx512fp16,avx512vl")]
3299#[cfg_attr(test, assert_instr(vfcmulcph))]
3300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3301pub fn _mm_cmul_pch(a: __m128h, b: __m128h) -> __m128h {
3302    _mm_mask_cmul_pch(_mm_undefined_ph(), 0xff, a, b)
3303}
3304
3305/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3306/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3307/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3308/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3309///
3310/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_pch)
3311#[inline]
3312#[target_feature(enable = "avx512fp16,avx512vl")]
3313#[cfg_attr(test, assert_instr(vfcmulcph))]
3314#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3315pub fn _mm_mask_cmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3316    unsafe { transmute(vfcmulcph_128(transmute(a), transmute(b), transmute(src), k)) }
3317}
3318
3319/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3320/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3321/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3322/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3323///
3324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_pch)
3325#[inline]
3326#[target_feature(enable = "avx512fp16,avx512vl")]
3327#[cfg_attr(test, assert_instr(vfcmulcph))]
3328#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3329pub fn _mm_maskz_cmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3330    _mm_mask_cmul_pch(_mm_setzero_ph(), k, a, b)
3331}
3332
3333/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3334/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3335/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3336/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3337///
3338/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmul_pch)
3339#[inline]
3340#[target_feature(enable = "avx512fp16,avx512vl")]
3341#[cfg_attr(test, assert_instr(vfcmulcph))]
3342#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3343pub fn _mm256_cmul_pch(a: __m256h, b: __m256h) -> __m256h {
3344    _mm256_mask_cmul_pch(_mm256_undefined_ph(), 0xff, a, b)
3345}
3346
3347/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3348/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3349/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3350/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3351///
3352/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmul_pch)
3353#[inline]
3354#[target_feature(enable = "avx512fp16,avx512vl")]
3355#[cfg_attr(test, assert_instr(vfcmulcph))]
3356#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3357pub fn _mm256_mask_cmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3358    unsafe { transmute(vfcmulcph_256(transmute(a), transmute(b), transmute(src), k)) }
3359}
3360
3361/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3362/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3363/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3364/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3365///
3366/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cmul_pch)
3367#[inline]
3368#[target_feature(enable = "avx512fp16,avx512vl")]
3369#[cfg_attr(test, assert_instr(vfcmulcph))]
3370#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3371pub fn _mm256_maskz_cmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3372    _mm256_mask_cmul_pch(_mm256_setzero_ph(), k, a, b)
3373}
3374
3375/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3376/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3377/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3378/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3379///
3380/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_pch)
3381#[inline]
3382#[target_feature(enable = "avx512fp16")]
3383#[cfg_attr(test, assert_instr(vfcmulcph))]
3384#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3385pub fn _mm512_cmul_pch(a: __m512h, b: __m512h) -> __m512h {
3386    _mm512_mask_cmul_pch(_mm512_undefined_ph(), 0xffff, a, b)
3387}
3388
3389/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3390/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3391/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3392/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3393///
3394/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_pch)
3395#[inline]
3396#[target_feature(enable = "avx512fp16")]
3397#[cfg_attr(test, assert_instr(vfcmulcph))]
3398#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3399pub fn _mm512_mask_cmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3400    _mm512_mask_cmul_round_pch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3401}
3402
3403/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3404/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3405/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3406/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3407///
3408/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_pch)
3409#[inline]
3410#[target_feature(enable = "avx512fp16")]
3411#[cfg_attr(test, assert_instr(vfcmulcph))]
3412#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3413pub fn _mm512_maskz_cmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3414    _mm512_mask_cmul_pch(_mm512_setzero_ph(), k, a, b)
3415}
3416
3417/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3418/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3419/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3420/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3421///
3422/// Rounding is done according to the rounding parameter, which can be one of:
3423///
3424/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3425/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3426/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3427/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3428/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3429///
3430/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmul_round_pch)
3431#[inline]
3432#[target_feature(enable = "avx512fp16")]
3433#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3434#[rustc_legacy_const_generics(2)]
3435#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3436pub fn _mm512_cmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3437    static_assert_rounding!(ROUNDING);
3438    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_undefined_ph(), 0xffff, a, b)
3439}
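
// Usage sketch (assumption: `cmul_round_to_nearest` is hypothetical and the caller has
// already verified `avx512fp16` support): the rounding mode is a const generic, so the
// chosen `_MM_FROUND_*` combination is spelled at the call site.
#[cfg(test)]
#[target_feature(enable = "avx512fp16")]
fn cmul_round_to_nearest(a: __m512h, b: __m512h) -> __m512h {
    // Round to nearest and suppress exceptions instead of deferring to MXCSR.RC.
    _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b)
}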
3440
3441/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3442/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3443/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3444/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3445///
3446/// Rounding is done according to the rounding parameter, which can be one of:
3447///
3448/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3449/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3450/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3451/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3452/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3453///
3454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmul_round_pch)
3455#[inline]
3456#[target_feature(enable = "avx512fp16")]
3457#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3458#[rustc_legacy_const_generics(4)]
3459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3460pub fn _mm512_mask_cmul_round_pch<const ROUNDING: i32>(
3461    src: __m512h,
3462    k: __mmask16,
3463    a: __m512h,
3464    b: __m512h,
3465) -> __m512h {
3466    unsafe {
3467        static_assert_rounding!(ROUNDING);
3468        transmute(vfcmulcph_512(
3469            transmute(a),
3470            transmute(b),
3471            transmute(src),
3472            k,
3473            ROUNDING,
3474        ))
3475    }
3476}
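
// Usage sketch (assumption: `cmul_masked_truncated` is hypothetical and the caller has
// already verified `avx512fp16` support): the writemask and the const rounding mode
// compose; masked-off complex elements are taken from `src`.
#[cfg(test)]
#[target_feature(enable = "avx512fp16")]
fn cmul_masked_truncated(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
    _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(src, k, a, b)
}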
3477
3478/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3479/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3480/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3481/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3482///
3483/// Rounding is done according to the rounding parameter, which can be one of:
3484///
3485/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3486/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3487/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3488/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3489/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3490///
3491/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cmul_round_pch)
3492#[inline]
3493#[target_feature(enable = "avx512fp16")]
3494#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3495#[rustc_legacy_const_generics(3)]
3496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3497pub fn _mm512_maskz_cmul_round_pch<const ROUNDING: i32>(
3498    k: __mmask16,
3499    a: __m512h,
3500    b: __m512h,
3501) -> __m512h {
3502    static_assert_rounding!(ROUNDING);
3503    _mm512_mask_cmul_round_pch::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
3504}
3505
3506/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3507/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3508/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3509///
3510/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_sch)
3511#[inline]
3512#[target_feature(enable = "avx512fp16")]
3513#[cfg_attr(test, assert_instr(vfcmulcsh))]
3514#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3515pub fn _mm_cmul_sch(a: __m128h, b: __m128h) -> __m128h {
3516    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), 0xff, a, b)
3517}
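
// Reference sketch (assumption: `cmul_sch_lower_reference` is a hypothetical helper, not
// part of this module): only the lowest complex element, i.e. `f16` lanes 0 (real) and
// 1 (imaginary) of each operand, participates in the conjugate multiply.
#[cfg(test)]
fn cmul_sch_lower_reference(a: [f16; 8], b: [f16; 8]) -> (f16, f16) {
    let re = a[0] * b[0] + a[1] * b[1]; // real part of a0 * conj(b0)
    let im = a[1] * b[0] - a[0] * b[1]; // imaginary part of a0 * conj(b0)
    (re, im)
}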
3518
3519/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3520/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3521/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3522/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3523///
3524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_sch)
3525#[inline]
3526#[target_feature(enable = "avx512fp16")]
3527#[cfg_attr(test, assert_instr(vfcmulcsh))]
3528#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3529pub fn _mm_mask_cmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3530    _mm_mask_cmul_round_sch::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
3531}
3532
3533/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3534/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3535/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3536/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3537///
3538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_sch)
3539#[inline]
3540#[target_feature(enable = "avx512fp16")]
3541#[cfg_attr(test, assert_instr(vfcmulcsh))]
3542#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3543pub fn _mm_maskz_cmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3544    _mm_mask_cmul_sch(f16x8::ZERO.as_m128h(), k, a, b)
3545}
3546
3547/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3548/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3549/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3550///
3551/// Rounding is done according to the rounding parameter, which can be one of:
3552///
3553/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3554/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3555/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3556/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3557/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3558///
3559/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmul_round_sch)
3560#[inline]
3561#[target_feature(enable = "avx512fp16")]
3562#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3563#[rustc_legacy_const_generics(2)]
3564#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3565pub fn _mm_cmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3566    static_assert_rounding!(ROUNDING);
3567    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
3568}
3569
3570/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3571/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3572/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3573/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3574///
3575/// Rounding is done according to the rounding parameter, which can be one of:
3576///
3577/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3578/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3579/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3580/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3581/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3582///
3583/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmul_round_sch)
3584#[inline]
3585#[target_feature(enable = "avx512fp16")]
3586#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3587#[rustc_legacy_const_generics(4)]
3588#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3589pub fn _mm_mask_cmul_round_sch<const ROUNDING: i32>(
3590    src: __m128h,
3591    k: __mmask8,
3592    a: __m128h,
3593    b: __m128h,
3594) -> __m128h {
3595    unsafe {
3596        static_assert_rounding!(ROUNDING);
3597        transmute(vfcmulcsh(
3598            transmute(a),
3599            transmute(b),
3600            transmute(src),
3601            k,
3602            ROUNDING,
3603        ))
3604    }
3605}
3606
3607/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3608/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3609/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3610/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3611///
3612/// Rounding is done according to the rounding parameter, which can be one of:
3613///
3614/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3615/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3616/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3617/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3618/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3619///
3620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cmul_round_sch)
3621#[inline]
3622#[target_feature(enable = "avx512fp16")]
3623#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3624#[rustc_legacy_const_generics(3)]
3625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3626pub fn _mm_maskz_cmul_round_sch<const ROUNDING: i32>(
3627    k: __mmask8,
3628    a: __m128h,
3629    b: __m128h,
3630) -> __m128h {
3631    static_assert_rounding!(ROUNDING);
3632    _mm_mask_cmul_round_sch::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
3633}
3634
3635/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3636/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3637/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3638/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3639///
3640/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_pch)
3641#[inline]
3642#[target_feature(enable = "avx512fp16,avx512vl")]
3643#[cfg_attr(test, assert_instr(vfcmulcph))]
3644#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3645pub fn _mm_fcmul_pch(a: __m128h, b: __m128h) -> __m128h {
3646    _mm_cmul_pch(a, b)
3647}
3648
3649/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3650/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3651/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3652/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3653///
3654/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_pch)
3655#[inline]
3656#[target_feature(enable = "avx512fp16,avx512vl")]
3657#[cfg_attr(test, assert_instr(vfcmulcph))]
3658#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3659pub fn _mm_mask_fcmul_pch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3660    _mm_mask_cmul_pch(src, k, a, b)
3661}
3662
3663/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3664/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3665/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3666/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3667///
3668/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_pch)
3669#[inline]
3670#[target_feature(enable = "avx512fp16,avx512vl")]
3671#[cfg_attr(test, assert_instr(vfcmulcph))]
3672#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3673pub fn _mm_maskz_fcmul_pch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3674    _mm_maskz_cmul_pch(k, a, b)
3675}
3676
3677/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3678/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3679/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3680/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3681///
3682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmul_pch)
3683#[inline]
3684#[target_feature(enable = "avx512fp16,avx512vl")]
3685#[cfg_attr(test, assert_instr(vfcmulcph))]
3686#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3687pub fn _mm256_fcmul_pch(a: __m256h, b: __m256h) -> __m256h {
3688    _mm256_cmul_pch(a, b)
3689}
3690
3691/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3692/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3693/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3694/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3695///
3696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmul_pch)
3697#[inline]
3698#[target_feature(enable = "avx512fp16,avx512vl")]
3699#[cfg_attr(test, assert_instr(vfcmulcph))]
3700#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3701pub fn _mm256_mask_fcmul_pch(src: __m256h, k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3702    _mm256_mask_cmul_pch(src, k, a, b)
3703}
3704
3705/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3706/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3707/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3708/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3709///
3710/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmul_pch)
3711#[inline]
3712#[target_feature(enable = "avx512fp16,avx512vl")]
3713#[cfg_attr(test, assert_instr(vfcmulcph))]
3714#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3715pub fn _mm256_maskz_fcmul_pch(k: __mmask8, a: __m256h, b: __m256h) -> __m256h {
3716    _mm256_maskz_cmul_pch(k, a, b)
3717}
3718
3719/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3720/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3721/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3722/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3723///
3724/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_pch)
3725#[inline]
3726#[target_feature(enable = "avx512fp16")]
3727#[cfg_attr(test, assert_instr(vfcmulcph))]
3728#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3729pub fn _mm512_fcmul_pch(a: __m512h, b: __m512h) -> __m512h {
3730    _mm512_cmul_pch(a, b)
3731}
3732
3733/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3734/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3735/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3736/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3737///
3738/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_pch)
3739#[inline]
3740#[target_feature(enable = "avx512fp16")]
3741#[cfg_attr(test, assert_instr(vfcmulcph))]
3742#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3743pub fn _mm512_mask_fcmul_pch(src: __m512h, k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3744    _mm512_mask_cmul_pch(src, k, a, b)
3745}
3746
3747/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3748/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3749/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3750/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3751///
3752/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_pch)
3753#[inline]
3754#[target_feature(enable = "avx512fp16")]
3755#[cfg_attr(test, assert_instr(vfcmulcph))]
3756#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3757pub fn _mm512_maskz_fcmul_pch(k: __mmask16, a: __m512h, b: __m512h) -> __m512h {
3758    _mm512_maskz_cmul_pch(k, a, b)
3759}
3760
3761/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3762/// store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3763/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3764///
3765/// Rounding is done according to the rounding parameter, which can be one of:
3766///
3767/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3768/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3769/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3770/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3771/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3772///
3773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmul_round_pch)
3774#[inline]
3775#[target_feature(enable = "avx512fp16")]
3776#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3777#[rustc_legacy_const_generics(2)]
3778#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3779pub fn _mm512_fcmul_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
3780    static_assert_rounding!(ROUNDING);
3781    _mm512_cmul_round_pch::<ROUNDING>(a, b)
3782}
3783
3784/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3785/// store the results in dst using writemask k (the element is copied from src when corresponding mask bit is not set).
3786/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3787/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3788///
3789/// Rounding is done according to the rounding parameter, which can be one of:
3790///
3791/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3792/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3793/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3794/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3795/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3796///
3797/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmul_round_pch)
3798#[inline]
3799#[target_feature(enable = "avx512fp16")]
3800#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3801#[rustc_legacy_const_generics(4)]
3802#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3803pub fn _mm512_mask_fcmul_round_pch<const ROUNDING: i32>(
3804    src: __m512h,
3805    k: __mmask16,
3806    a: __m512h,
3807    b: __m512h,
3808) -> __m512h {
3809    static_assert_rounding!(ROUNDING);
3810    _mm512_mask_cmul_round_pch::<ROUNDING>(src, k, a, b)
3811}
3812
3813/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, and
3814/// store the results in dst using zeromask k (the element is zeroed out when corresponding mask bit is not set).
3815/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3816/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3817///
3818/// Rounding is done according to the rounding parameter, which can be one of:
3819///
3820/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3821/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3822/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3823/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3824/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3825///
3826/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmul_round_pch)
3827#[inline]
3828#[target_feature(enable = "avx512fp16")]
3829#[cfg_attr(test, assert_instr(vfcmulcph, ROUNDING = 8))]
3830#[rustc_legacy_const_generics(3)]
3831#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3832pub fn _mm512_maskz_fcmul_round_pch<const ROUNDING: i32>(
3833    k: __mmask16,
3834    a: __m512h,
3835    b: __m512h,
3836) -> __m512h {
3837    static_assert_rounding!(ROUNDING);
3838    _mm512_maskz_cmul_round_pch::<ROUNDING>(k, a, b)
3839}
3840
3841/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3842/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3843/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
3844/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3845///
3846/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_sch)
3847#[inline]
3848#[target_feature(enable = "avx512fp16")]
3849#[cfg_attr(test, assert_instr(vfcmulcsh))]
3850#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3851pub fn _mm_fcmul_sch(a: __m128h, b: __m128h) -> __m128h {
3852    _mm_cmul_sch(a, b)
3853}
3854
3855/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3856/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3857/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3858/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3859///
3860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_sch)
3861#[inline]
3862#[target_feature(enable = "avx512fp16")]
3863#[cfg_attr(test, assert_instr(vfcmulcsh))]
3864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3865pub fn _mm_mask_fcmul_sch(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3866    _mm_mask_cmul_sch(src, k, a, b)
3867}
3868
3869/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3870/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3871/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3872/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3873///
3874/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_sch)
3875#[inline]
3876#[target_feature(enable = "avx512fp16")]
3877#[cfg_attr(test, assert_instr(vfcmulcsh))]
3878#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3879pub fn _mm_maskz_fcmul_sch(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
3880    _mm_maskz_cmul_sch(k, a, b)
3881}
3882
3883/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3884/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
3885/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3886///
3887/// Rounding is done according to the rounding parameter, which can be one of:
3888///
3889/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3890/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3891/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3892/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3893/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3894///
3895/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmul_round_sch)
3896#[inline]
3897#[target_feature(enable = "avx512fp16")]
3898#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3899#[rustc_legacy_const_generics(2)]
3900#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3901pub fn _mm_fcmul_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
3902    static_assert_rounding!(ROUNDING);
3903    _mm_cmul_round_sch::<ROUNDING>(a, b)
3904}
3905
3906/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3907/// and store the results in dst using writemask k (the element is copied from src when mask bit 0 is not set).
3908/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3909/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3910///
3911/// Rounding is done according to the rounding parameter, which can be one of:
3912///
3913/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3914/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3915/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3916/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3917/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3918///
3919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmul_round_sch)
3920#[inline]
3921#[target_feature(enable = "avx512fp16")]
3922#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3923#[rustc_legacy_const_generics(4)]
3924#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3925pub fn _mm_mask_fcmul_round_sch<const ROUNDING: i32>(
3926    src: __m128h,
3927    k: __mmask8,
3928    a: __m128h,
3929    b: __m128h,
3930) -> __m128h {
3931    static_assert_rounding!(ROUNDING);
3932    _mm_mask_cmul_round_sch::<ROUNDING>(src, k, a, b)
3933}
3934
3935/// Multiply the lower complex numbers in a by the complex conjugates of the lower complex numbers in b,
3936/// and store the results in dst using zeromask k (the element is zeroed out when mask bit 0 is not set).
3937/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
3938/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
3939///
3940/// Rounding is done according to the rounding parameter, which can be one of:
3941///
3942/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
3943/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
3944/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
3945/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
3946/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
3947///
3948/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmul_round_sch)
3949#[inline]
3950#[target_feature(enable = "avx512fp16")]
3951#[cfg_attr(test, assert_instr(vfcmulcsh, ROUNDING = 8))]
3952#[rustc_legacy_const_generics(3)]
3953#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3954pub fn _mm_maskz_fcmul_round_sch<const ROUNDING: i32>(
3955    k: __mmask8,
3956    a: __m128h,
3957    b: __m128h,
3958) -> __m128h {
3959    static_assert_rounding!(ROUNDING);
3960    _mm_maskz_cmul_round_sch::<ROUNDING>(k, a, b)
3961}
3962
3963/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3964/// the results in dst.
3965///
3966/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_ph)
3967#[inline]
3968#[target_feature(enable = "avx512fp16,avx512vl")]
3969#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3970pub fn _mm_abs_ph(v2: __m128h) -> __m128h {
3971    unsafe { transmute(_mm_and_si128(transmute(v2), _mm_set1_epi16(i16::MAX))) }
3972}
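
// Scalar model (assumption: `abs_f16_bits` is a hypothetical helper, not the actual
// lowering): clearing bit 15 of an IEEE 754 binary16 clears its sign, which is what the
// `AND` with `i16::MAX` (0x7FFF) above does for every lane at once.
#[cfg(test)]
fn abs_f16_bits(x: u16) -> u16 {
    x & 0x7FFF // clear the sign bit; exponent and mantissa are untouched
}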
3973
3974/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3975/// the result in dst.
3976///
3977/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_ph)
3978#[inline]
3979#[target_feature(enable = "avx512fp16,avx512vl")]
3980#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3981pub fn _mm256_abs_ph(v2: __m256h) -> __m256h {
3982    unsafe { transmute(_mm256_and_si256(transmute(v2), _mm256_set1_epi16(i16::MAX))) }
3983}
3984
3985/// Finds the absolute value of each packed half-precision (16-bit) floating-point element in v2, storing
3986/// the result in dst.
3987///
3988/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ph)
3989#[inline]
3990#[target_feature(enable = "avx512fp16")]
3991#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
3992pub fn _mm512_abs_ph(v2: __m512h) -> __m512h {
3993    unsafe { transmute(_mm512_and_si512(transmute(v2), _mm512_set1_epi16(i16::MAX))) }
3994}
3995
3996/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex
3997/// number is composed of two adjacent half-precision (16-bit) floating-point elements, which defines
3998/// the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate
3999/// `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4000///
4001/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_conj_pch)
4002#[inline]
4003#[target_feature(enable = "avx512fp16,avx512vl")]
4004#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4005pub fn _mm_conj_pch(a: __m128h) -> __m128h {
4006    unsafe { transmute(_mm_xor_si128(transmute(a), _mm_set1_epi32(i32::MIN))) }
4007}
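
// Scalar model (assumption: `conj_c16_bits` is a hypothetical helper, not the actual
// lowering): each complex element is a 32-bit pair whose upper `f16` is the imaginary
// part, so XOR-ing with `i32::MIN` (0x8000_0000) flips only that sign bit, i.e. negates
// the imaginary part while leaving the real part unchanged.
#[cfg(test)]
fn conj_c16_bits(c: u32) -> u32 {
    c ^ 0x8000_0000 // flip the sign bit of the imaginary (upper) f16 lane
}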
4008
4009/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4010/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4011/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4012/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4013///
4014/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_conj_pch)
4015#[inline]
4016#[target_feature(enable = "avx512fp16,avx512vl")]
4017#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4018pub fn _mm_mask_conj_pch(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
4019    unsafe {
4020        let r: __m128 = transmute(_mm_conj_pch(a));
4021        transmute(simd_select_bitmask(k, r, transmute(src)))
4022    }
4023}
4024
4025/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
4026/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4027/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4028/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4029///
4030/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_conj_pch)
4031#[inline]
4032#[target_feature(enable = "avx512fp16,avx512vl")]
4033#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4034pub fn _mm_maskz_conj_pch(k: __mmask8, a: __m128h) -> __m128h {
4035    _mm_mask_conj_pch(_mm_setzero_ph(), k, a)
4036}
4037
4038/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4039/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4040/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4041///
4042/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_conj_pch)
4043#[inline]
4044#[target_feature(enable = "avx512fp16,avx512vl")]
4045#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4046pub fn _mm256_conj_pch(a: __m256h) -> __m256h {
4047    unsafe { transmute(_mm256_xor_si256(transmute(a), _mm256_set1_epi32(i32::MIN))) }
4048}
4049
4050/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4051/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4052/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4053/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4054///
4055/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_conj_pch)
4056#[inline]
4057#[target_feature(enable = "avx512fp16,avx512vl")]
4058#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4059pub fn _mm256_mask_conj_pch(src: __m256h, k: __mmask8, a: __m256h) -> __m256h {
4060    unsafe {
4061        let r: __m256 = transmute(_mm256_conj_pch(a));
4062        transmute(simd_select_bitmask(k, r, transmute(src)))
4063    }
4064}
4065
4066/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
4067/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4068/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4069/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4070///
4071/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_conj_pch)
4072#[inline]
4073#[target_feature(enable = "avx512fp16,avx512vl")]
4074#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4075pub fn _mm256_maskz_conj_pch(k: __mmask8, a: __m256h) -> __m256h {
4076    _mm256_mask_conj_pch(_mm256_setzero_ph(), k, a)
4077}
4078
4079/// Compute the complex conjugates of complex numbers in a, and store the results in dst. Each complex number
4080/// is composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4081/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4082///
4083/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_conj_pch)
4084#[inline]
4085#[target_feature(enable = "avx512fp16")]
4086#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4087pub fn _mm512_conj_pch(a: __m512h) -> __m512h {
4088    unsafe { transmute(_mm512_xor_si512(transmute(a), _mm512_set1_epi32(i32::MIN))) }
4089}
4090
4091/// Compute the complex conjugates of complex numbers in a, and store the results in dst using writemask k
4092/// (the element is copied from src when corresponding mask bit is not set). Each complex number is composed of two
4093/// adjacent half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4094/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4095///
4096/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_conj_pch)
4097#[inline]
4098#[target_feature(enable = "avx512fp16")]
4099#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4100pub fn _mm512_mask_conj_pch(src: __m512h, k: __mmask16, a: __m512h) -> __m512h {
4101    unsafe {
4102        let r: __m512 = transmute(_mm512_conj_pch(a));
4103        transmute(simd_select_bitmask(k, r, transmute(src)))
4104    }
4105}
4106
4107/// Compute the complex conjugates of complex numbers in a, and store the results in dst using zeromask k
4108/// (the element is zeroed out when corresponding mask bit is not set). Each complex number is composed of two adjacent
4109/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4110/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4111///
4112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_conj_pch)
4113#[inline]
4114#[target_feature(enable = "avx512fp16")]
4115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4116pub fn _mm512_maskz_conj_pch(k: __mmask16, a: __m512h) -> __m512h {
4117    _mm512_mask_conj_pch(_mm512_setzero_ph(), k, a)
4118}
4119
4120/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4121/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4122/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4123///
4124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_pch)
4125#[inline]
4126#[target_feature(enable = "avx512fp16,avx512vl")]
4127#[cfg_attr(test, assert_instr(vfmaddcph))]
4128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4129pub fn _mm_fmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4130    _mm_mask3_fmadd_pch(a, b, c, 0xff)
4131}
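
// Reference sketch (assumption: `fmadd_pch_reference` is a hypothetical helper, not part
// of this module). Per complex element the operation is `dst = a * b + c`, i.e.
// `re = a.re*b.re - a.im*b.im + c.re` and `im = a.re*b.im + a.im*b.re + c.im`.
#[cfg(test)]
fn fmadd_pch_reference(a: &[f16], b: &[f16], c: &[f16], dst: &mut [f16]) {
    // `dst.len()` is assumed even: one (re, im) pair per complex element.
    for i in (0..dst.len()).step_by(2) {
        let (ar, ai) = (a[i], a[i + 1]);
        let (br, bi) = (b[i], b[i + 1]);
        dst[i] = ar * br - ai * bi + c[i]; // real part of a*b + c
        dst[i + 1] = ar * bi + ai * br + c[i + 1]; // imaginary part of a*b + c
    }
}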
4132
4133/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4134/// and store the results in dst using writemask k (the element is copied from a when the corresponding
4135/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4136/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4137///
4138/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pch)
4139#[inline]
4140#[target_feature(enable = "avx512fp16,avx512vl")]
4141#[cfg_attr(test, assert_instr(vfmaddcph))]
4142#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4143pub fn _mm_mask_fmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4144    unsafe {
4145        let r: __m128 = transmute(_mm_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4146        transmute(simd_select_bitmask(k, r, transmute(a)))
4147    }
4148}
4149
4150/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4151/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4152/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4153/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4154///
4155/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pch)
4156#[inline]
4157#[target_feature(enable = "avx512fp16,avx512vl")]
4158#[cfg_attr(test, assert_instr(vfmaddcph))]
4159#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4160pub fn _mm_mask3_fmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4161    unsafe {
4162        transmute(vfmaddcph_mask3_128(
4163            transmute(a),
4164            transmute(b),
4165            transmute(c),
4166            k,
4167        ))
4168    }
4169}
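
// Usage sketch (assumption: `fmadd_mask_flavors` is hypothetical and the caller has
// already verified `avx512fp16` and `avx512vl` support): the `mask` and `mask3` forms
// differ only in which operand supplies the masked-off complex elements.
#[cfg(test)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn fmadd_mask_flavors(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> (__m128h, __m128h) {
    let from_a = _mm_mask_fmadd_pch(a, k, b, c); // masked-off elements come from `a`
    let from_c = _mm_mask3_fmadd_pch(a, b, c, k); // masked-off elements come from `c`
    (from_a, from_c)
}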
4170
4171/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4172/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4173/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4174/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4175///
4176/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pch)
4177#[inline]
4178#[target_feature(enable = "avx512fp16,avx512vl")]
4179#[cfg_attr(test, assert_instr(vfmaddcph))]
4180#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4181pub fn _mm_maskz_fmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4182    unsafe {
4183        transmute(vfmaddcph_maskz_128(
4184            transmute(a),
4185            transmute(b),
4186            transmute(c),
4187            k,
4188        ))
4189    }
4190}
4191
4192/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4193/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4194/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4195///
4196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_pch)
4197#[inline]
4198#[target_feature(enable = "avx512fp16,avx512vl")]
4199#[cfg_attr(test, assert_instr(vfmaddcph))]
4200#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4201pub fn _mm256_fmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4202    _mm256_mask3_fmadd_pch(a, b, c, 0xff)
4203}
4204
4205/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4206/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4207/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4208/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4209///
4210/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pch)
4211#[inline]
4212#[target_feature(enable = "avx512fp16,avx512vl")]
4213#[cfg_attr(test, assert_instr(vfmaddcph))]
4214#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4215pub fn _mm256_mask_fmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4216    unsafe {
4217        let r: __m256 = transmute(_mm256_mask3_fmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4218        transmute(simd_select_bitmask(k, r, transmute(a)))
4219    }
4220}
4221
4222/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4223/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4224/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4225/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4226///
4227/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pch)
4228#[inline]
4229#[target_feature(enable = "avx512fp16,avx512vl")]
4230#[cfg_attr(test, assert_instr(vfmaddcph))]
4231#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4232pub fn _mm256_mask3_fmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4233    unsafe {
4234        transmute(vfmaddcph_mask3_256(
4235            transmute(a),
4236            transmute(b),
4237            transmute(c),
4238            k,
4239        ))
4240    }
4241}
4242
4243/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4244/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4245/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4246/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4247///
4248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pch)
4249#[inline]
4250#[target_feature(enable = "avx512fp16,avx512vl")]
4251#[cfg_attr(test, assert_instr(vfmaddcph))]
4252#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4253pub fn _mm256_maskz_fmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4254    unsafe {
4255        transmute(vfmaddcph_maskz_256(
4256            transmute(a),
4257            transmute(b),
4258            transmute(c),
4259            k,
4260        ))
4261    }
4262}
4263
4264/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4265/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4266/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4267///
4268/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pch)
4269#[inline]
4270#[target_feature(enable = "avx512fp16")]
4271#[cfg_attr(test, assert_instr(vfmaddcph))]
4272#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4273pub fn _mm512_fmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4274    _mm512_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4275}
4276
4277/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4278/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4279/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4280/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4281///
4282/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pch)
4283#[inline]
4284#[target_feature(enable = "avx512fp16")]
4285#[cfg_attr(test, assert_instr(vfmaddcph))]
4286#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4287pub fn _mm512_mask_fmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4288    _mm512_mask_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4289}
4290
4291/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4292/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4293/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4294/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4295///
4296/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pch)
4297#[inline]
4298#[target_feature(enable = "avx512fp16")]
4299#[cfg_attr(test, assert_instr(vfmaddcph))]
4300#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4301pub fn _mm512_mask3_fmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4302    _mm512_mask3_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4303}
4304
4305/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4306/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4307/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4308/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4309///
4310/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pch)
4311#[inline]
4312#[target_feature(enable = "avx512fp16")]
4313#[cfg_attr(test, assert_instr(vfmaddcph))]
4314#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4315pub fn _mm512_maskz_fmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4316    _mm512_maskz_fmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4317}
4318
4319/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4320/// and store the results in dst. Each complex number is composed of two adjacent half-precision (16-bit)
4321/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4322///
4323/// Rounding is done according to the rounding parameter, which can be one of:
4324///
4325/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4326/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4327/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4328/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4329/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4330///
4331/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pch)
4332#[inline]
4333#[target_feature(enable = "avx512fp16")]
4334#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4335#[rustc_legacy_const_generics(3)]
4336#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4337pub fn _mm512_fmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4338    static_assert_rounding!(ROUNDING);
4339    _mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4340}
4341
4342/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4343/// and store the results in dst using writemask k (the element is copied from a when the corresponding mask
4344/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4345/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4346///
4347/// Rounding is done according to the rounding parameter, which can be one of:
4348///
4349/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4350/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4351/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4352/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4353/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4354///
4355/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pch)
4356#[inline]
4357#[target_feature(enable = "avx512fp16")]
4358#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4359#[rustc_legacy_const_generics(4)]
4360#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4361pub fn _mm512_mask_fmadd_round_pch<const ROUNDING: i32>(
4362    a: __m512h,
4363    k: __mmask16,
4364    b: __m512h,
4365    c: __m512h,
4366) -> __m512h {
4367    unsafe {
4368        static_assert_rounding!(ROUNDING);
4369        let r: __m512 = transmute(_mm512_mask3_fmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4370        transmute(simd_select_bitmask(k, r, transmute(a)))
4371    }
4372}
4373
4374/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4375/// and store the results in dst using writemask k (the element is copied from c when the corresponding
4376/// mask bit is not set). Each complex number is composed of two adjacent half-precision (16-bit)
4377/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4378///
4379/// Rounding is done according to the rounding parameter, which can be one of:
4380///
4381/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4382/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4383/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4384/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4385/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4386///
4387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pch)
4388#[inline]
4389#[target_feature(enable = "avx512fp16")]
4390#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4391#[rustc_legacy_const_generics(4)]
4392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4393pub fn _mm512_mask3_fmadd_round_pch<const ROUNDING: i32>(
4394    a: __m512h,
4395    b: __m512h,
4396    c: __m512h,
4397    k: __mmask16,
4398) -> __m512h {
4399    unsafe {
4400        static_assert_rounding!(ROUNDING);
4401        transmute(vfmaddcph_mask3_512(
4402            transmute(a),
4403            transmute(b),
4404            transmute(c),
4405            k,
4406            ROUNDING,
4407        ))
4408    }
4409}
4410
4411/// Multiply packed complex numbers in a and b, accumulate to the corresponding complex numbers in c,
4412/// and store the results in dst using zeromask k (the element is zeroed out when the corresponding mask
4413/// bit is not set). Each complex number is composed of two adjacent half-precision (16-bit) floating-point
4414/// elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4415///
4416/// Rounding is done according to the rounding parameter, which can be one of:
4417///
4418/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4419/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4420/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4421/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4422/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4423///
4424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pch)
4425#[inline]
4426#[target_feature(enable = "avx512fp16")]
4427#[cfg_attr(test, assert_instr(vfmaddcph, ROUNDING = 8))]
4428#[rustc_legacy_const_generics(4)]
4429#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4430pub fn _mm512_maskz_fmadd_round_pch<const ROUNDING: i32>(
4431    k: __mmask16,
4432    a: __m512h,
4433    b: __m512h,
4434    c: __m512h,
4435) -> __m512h {
4436    unsafe {
4437        static_assert_rounding!(ROUNDING);
4438        transmute(vfmaddcph_maskz_512(
4439            transmute(a),
4440            transmute(b),
4441            transmute(c),
4442            k,
4443            ROUNDING,
4444        ))
4445    }
4446}
4447
4448/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4449/// store the result in the lower elements of dst, and copy the upper 6 packed elements from a to the
4450/// upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
4451/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4452///
4453/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sch)
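///
/// A minimal usage sketch with illustrative values (not taken from Intel's documentation),
/// assuming the `avx512fp16` target feature is available:
///
/// ```ignore
/// // `_mm_set_ph` takes element 0 as its last argument, so the lower complex lane of
/// // `a` is 1 + 2i and its upper six f16 elements are 3.0..=8.0.
/// let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
/// // Lower complex lane of `b` is 3 + 4i, of `c` is 1 + 1i.
/// let b = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 3.0);
/// let c = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0);
/// // (1 + 2i) * (3 + 4i) + (1 + 1i) = -4 + 11i in the lower complex lane; the upper
/// // six elements are copied from `a`, so dst holds [-4.0, 11.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0].
/// let r = _mm_fmadd_sch(a, b, c);
/// ```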
4454#[inline]
4455#[target_feature(enable = "avx512fp16")]
4456#[cfg_attr(test, assert_instr(vfmaddcsh))]
4457#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4458pub fn _mm_fmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4459    _mm_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4460}
4461
4462/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4463/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4464/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4465/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4466/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4467///
4468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sch)
4469#[inline]
4470#[target_feature(enable = "avx512fp16")]
4471#[cfg_attr(test, assert_instr(vfmaddcsh))]
4472#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4473pub fn _mm_mask_fmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4474    _mm_mask_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4475}
4476
4477/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4478/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4479/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4480/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4481/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4482///
4483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sch)
4484#[inline]
4485#[target_feature(enable = "avx512fp16")]
4486#[cfg_attr(test, assert_instr(vfmaddcsh))]
4487#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4488pub fn _mm_mask3_fmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4489    _mm_mask3_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4490}
4491
4492/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4493/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4494/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4495/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4496/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4497///
4498/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sch)
4499#[inline]
4500#[target_feature(enable = "avx512fp16")]
4501#[cfg_attr(test, assert_instr(vfmaddcsh))]
4502#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4503pub fn _mm_maskz_fmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4504    _mm_maskz_fmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4505}
4506
4507/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4508/// store the result in the lower elements of dst. Each complex number is composed of two adjacent
4509/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4510///
4511/// Rounding is done according to the rounding parameter, which can be one of:
4512///
4513/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4514/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4515/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4516/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4517/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4518///
4519/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sch)
4520#[inline]
4521#[target_feature(enable = "avx512fp16")]
4522#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4523#[rustc_legacy_const_generics(3)]
4524#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4525pub fn _mm_fmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4526    unsafe {
4527        static_assert_rounding!(ROUNDING);
4528        transmute(vfmaddcsh_mask(
4529            transmute(a),
4530            transmute(b),
4531            transmute(c),
4532            0xff,
4533            ROUNDING,
4534        ))
4535    }
4536}
4537
4538/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4539/// store the result in the lower elements of dst using writemask k (elements are copied from a when
4540/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4541/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4542/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4543///
4544/// Rounding is done according to the rounding parameter, which can be one of:
4545///
4546/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4547/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4548/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4549/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4550/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4551///
4552/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sch)
4553#[inline]
4554#[target_feature(enable = "avx512fp16")]
4555#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4556#[rustc_legacy_const_generics(4)]
4557#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4558pub fn _mm_mask_fmadd_round_sch<const ROUNDING: i32>(
4559    a: __m128h,
4560    k: __mmask8,
4561    b: __m128h,
4562    c: __m128h,
4563) -> __m128h {
4564    unsafe {
4565        static_assert_rounding!(ROUNDING);
4566        let a = transmute(a);
4567        let r = vfmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING); // using `0xff` would have been fine here, but this is what Clang does
4568        transmute(_mm_mask_move_ss(a, k, a, r))
4569    }
4570}
4571
4572/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4573/// store the result in the lower elements of dst using writemask k (elements are copied from c when
4574/// mask bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst.
4575/// Each complex number is composed of two adjacent half-precision (16-bit) floating-point elements,
4576/// which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4577///
4578/// Rounding is done according to the rounding parameter, which can be one of:
4579///
4580/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4581/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4582/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4583/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4584/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4585///
4586/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sch)
4587#[inline]
4588#[target_feature(enable = "avx512fp16")]
4589#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4590#[rustc_legacy_const_generics(4)]
4591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4592pub fn _mm_mask3_fmadd_round_sch<const ROUNDING: i32>(
4593    a: __m128h,
4594    b: __m128h,
4595    c: __m128h,
4596    k: __mmask8,
4597) -> __m128h {
4598    unsafe {
4599        static_assert_rounding!(ROUNDING);
4600        let c = transmute(c);
4601        let r = vfmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
4602        transmute(_mm_move_ss(c, r))
4603    }
4604}
4605
4606/// Multiply the lower complex numbers in a and b, accumulate to the lower complex number in c, and
4607/// store the result in the lower elements of dst using zeromask k (elements are zeroed out when mask
4608/// bit 0 is not set), and copy the upper 6 packed elements from a to the upper elements of dst. Each
4609/// complex number is composed of two adjacent half-precision (16-bit) floating-point elements, which
4610/// defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`.
4611///
4612/// Rounding is done according to the rounding parameter, which can be one of:
4613///
4614/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4615/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4616/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4617/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4618/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4619///
4620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sch)
4621#[inline]
4622#[target_feature(enable = "avx512fp16")]
4623#[cfg_attr(test, assert_instr(vfmaddcsh, ROUNDING = 8))]
4624#[rustc_legacy_const_generics(4)]
4625#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4626pub fn _mm_maskz_fmadd_round_sch<const ROUNDING: i32>(
4627    k: __mmask8,
4628    a: __m128h,
4629    b: __m128h,
4630    c: __m128h,
4631) -> __m128h {
4632    unsafe {
4633        static_assert_rounding!(ROUNDING);
4634        transmute(vfmaddcsh_maskz(
4635            transmute(a),
4636            transmute(b),
4637            transmute(c),
4638            k,
4639            ROUNDING,
4640        ))
4641    }
4642}
4643
4644/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4645/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4646/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4647/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4648///
4649/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_pch)
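///
/// A minimal usage sketch with illustrative values (not taken from Intel's documentation),
/// assuming the `avx512fp16` and `avx512vl` target features are available:
///
/// ```ignore
/// // Every complex lane of `a` is 1 + 1i, of `b` is 2 + 2i, of `c` is 0.5 + 0.5i.
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(2.0);
/// let c = _mm_set1_ph(0.5);
/// // (1 + i) * conj(2 + 2i) + (0.5 + 0.5i) = (1 + i) * (2 - 2i) + (0.5 + 0.5i)
/// //                                       = 4.5 + 0.5i in every complex lane.
/// let r = _mm_fcmadd_pch(a, b, c);
/// ```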
4650#[inline]
4651#[target_feature(enable = "avx512fp16,avx512vl")]
4652#[cfg_attr(test, assert_instr(vfcmaddcph))]
4653#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4654pub fn _mm_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4655    _mm_mask3_fcmadd_pch(a, b, c, 0xff)
4656}
4657
4658/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4659/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4660/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4661/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4662/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4663///
4664/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_pch)
4665#[inline]
4666#[target_feature(enable = "avx512fp16,avx512vl")]
4667#[cfg_attr(test, assert_instr(vfcmaddcph))]
4668#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4669pub fn _mm_mask_fcmadd_pch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
4670    unsafe {
4671        let r: __m128 = transmute(_mm_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4672        transmute(simd_select_bitmask(k, r, transmute(a)))
4673    }
4674}
4675
4676/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4677/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4678/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4679/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4680/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4681///
4682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_pch)
4683#[inline]
4684#[target_feature(enable = "avx512fp16,avx512vl")]
4685#[cfg_attr(test, assert_instr(vfcmaddcph))]
4686#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4687pub fn _mm_mask3_fcmadd_pch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
4688    unsafe {
4689        transmute(vfcmaddcph_mask3_128(
4690            transmute(a),
4691            transmute(b),
4692            transmute(c),
4693            k,
4694        ))
4695    }
4696}
4697
4698/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4699/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4700/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4701/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4702/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4703///
4704/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_pch)
4705#[inline]
4706#[target_feature(enable = "avx512fp16,avx512vl")]
4707#[cfg_attr(test, assert_instr(vfcmaddcph))]
4708#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4709pub fn _mm_maskz_fcmadd_pch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
4710    unsafe {
4711        transmute(vfcmaddcph_maskz_128(
4712            transmute(a),
4713            transmute(b),
4714            transmute(c),
4715            k,
4716        ))
4717    }
4718}
4719
4720/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4721/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4722/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4723/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4724///
4725/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fcmadd_pch)
4726#[inline]
4727#[target_feature(enable = "avx512fp16,avx512vl")]
4728#[cfg_attr(test, assert_instr(vfcmaddcph))]
4729#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4730pub fn _mm256_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4731    _mm256_mask3_fcmadd_pch(a, b, c, 0xff)
4732}
4733
4734/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4735/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4736/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4737/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4738/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4739///
4740/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fcmadd_pch)
4741#[inline]
4742#[target_feature(enable = "avx512fp16,avx512vl")]
4743#[cfg_attr(test, assert_instr(vfcmaddcph))]
4744#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4745pub fn _mm256_mask_fcmadd_pch(a: __m256h, k: __mmask8, b: __m256h, c: __m256h) -> __m256h {
4746    unsafe {
4747        let r: __m256 = transmute(_mm256_mask3_fcmadd_pch(a, b, c, k)); // using `0xff` would have been fine here, but this is what Clang does
4748        transmute(simd_select_bitmask(k, r, transmute(a)))
4749    }
4750}
4751
4752/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4753/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4754/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4755/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4756/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4757///
4758/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fcmadd_pch)
4759#[inline]
4760#[target_feature(enable = "avx512fp16,avx512vl")]
4761#[cfg_attr(test, assert_instr(vfcmaddcph))]
4762#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4763pub fn _mm256_mask3_fcmadd_pch(a: __m256h, b: __m256h, c: __m256h, k: __mmask8) -> __m256h {
4764    unsafe {
4765        transmute(vfcmaddcph_mask3_256(
4766            transmute(a),
4767            transmute(b),
4768            transmute(c),
4769            k,
4770        ))
4771    }
4772}
4773
4774/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4775/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4776/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4777/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4778/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4779///
4780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fcmadd_pch)
4781#[inline]
4782#[target_feature(enable = "avx512fp16,avx512vl")]
4783#[cfg_attr(test, assert_instr(vfcmaddcph))]
4784#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4785pub fn _mm256_maskz_fcmadd_pch(k: __mmask8, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
4786    unsafe {
4787        transmute(vfcmaddcph_maskz_256(
4788            transmute(a),
4789            transmute(b),
4790            transmute(c),
4791            k,
4792        ))
4793    }
4794}
4795
4796/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4797/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4798/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4799/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4800///
4801/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_pch)
4802#[inline]
4803#[target_feature(enable = "avx512fp16")]
4804#[cfg_attr(test, assert_instr(vfcmaddcph))]
4805#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4806pub fn _mm512_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4807    _mm512_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
4808}
4809
4810/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4811/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4812/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4813/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4814/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4815///
4816/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_pch)
4817#[inline]
4818#[target_feature(enable = "avx512fp16")]
4819#[cfg_attr(test, assert_instr(vfcmaddcph))]
4820#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4821pub fn _mm512_mask_fcmadd_pch(a: __m512h, k: __mmask16, b: __m512h, c: __m512h) -> __m512h {
4822    _mm512_mask_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
4823}
4824
4825/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4826/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4827/// copied from c when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4828/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4829/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4830///
4831/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_pch)
4832#[inline]
4833#[target_feature(enable = "avx512fp16")]
4834#[cfg_attr(test, assert_instr(vfcmaddcph))]
4835#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4836pub fn _mm512_mask3_fcmadd_pch(a: __m512h, b: __m512h, c: __m512h, k: __mmask16) -> __m512h {
4837    _mm512_mask3_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
4838}
4839
4840/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4841/// to the corresponding complex numbers in c, and store the results in dst using zeromask k (the element is
4842/// zeroed out when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4843/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4844/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4845///
4846/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_pch)
4847#[inline]
4848#[target_feature(enable = "avx512fp16")]
4849#[cfg_attr(test, assert_instr(vfcmaddcph))]
4850#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4851pub fn _mm512_maskz_fcmadd_pch(k: __mmask16, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4852    _mm512_maskz_fcmadd_round_pch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
4853}
4854
4855/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4856/// to the corresponding complex numbers in c, and store the results in dst. Each complex number is composed
4857/// of two adjacent half-precision (16-bit) floating-point elements, which defines the complex number
4858/// `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4859///
4860/// Rounding is done according to the rounding parameter, which can be one of:
4861///
4862/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4863/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4864/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4865/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4866/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4867///
4868/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fcmadd_round_pch)
4869#[inline]
4870#[target_feature(enable = "avx512fp16")]
4871#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4872#[rustc_legacy_const_generics(3)]
4873#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4874pub fn _mm512_fcmadd_round_pch<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
4875    static_assert_rounding!(ROUNDING);
4876    _mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, 0xffff)
4877}
4878
4879/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4880/// to the corresponding complex numbers in c, and store the results in dst using writemask k (the element is
4881/// copied from a when the corresponding mask bit is not set). Each complex number is composed of two adjacent
4882/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
4883/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4884///
4885/// Rounding is done according to the rounding parameter, which can be one of:
4886///
4887/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4888/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4889/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4890/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4891/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4892///
4893/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fcmadd_round_pch)
4894#[inline]
4895#[target_feature(enable = "avx512fp16")]
4896#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4897#[rustc_legacy_const_generics(4)]
4898#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4899pub fn _mm512_mask_fcmadd_round_pch<const ROUNDING: i32>(
4900    a: __m512h,
4901    k: __mmask16,
4902    b: __m512h,
4903    c: __m512h,
4904) -> __m512h {
4905    unsafe {
4906        static_assert_rounding!(ROUNDING);
4907        let r: __m512 = transmute(_mm512_mask3_fcmadd_round_pch::<ROUNDING>(a, b, c, k)); // using `0xffff` would have been fine here, but this is what Clang does
4908        transmute(simd_select_bitmask(k, r, transmute(a)))
4909    }
4910}
4911
4912/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4913/// to the corresponding complex numbers in c using writemask k (the element is copied from c when the corresponding
4914/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
4915/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4916/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4917///
4918/// Rounding is done according to the rounding parameter, which can be one of:
4919///
4920/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4921/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4922/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4923/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4924/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4925///
4926/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fcmadd_round_pch)
4927#[inline]
4928#[target_feature(enable = "avx512fp16")]
4929#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4930#[rustc_legacy_const_generics(4)]
4931#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4932pub fn _mm512_mask3_fcmadd_round_pch<const ROUNDING: i32>(
4933    a: __m512h,
4934    b: __m512h,
4935    c: __m512h,
4936    k: __mmask16,
4937) -> __m512h {
4938    unsafe {
4939        static_assert_rounding!(ROUNDING);
4940        transmute(vfcmaddcph_mask3_512(
4941            transmute(a),
4942            transmute(b),
4943            transmute(c),
4944            k,
4945            ROUNDING,
4946        ))
4947    }
4948}
4949
4950/// Multiply packed complex numbers in a by the complex conjugates of packed complex numbers in b, accumulate
4951/// to the corresponding complex numbers in c using zeromask k (the element is zeroed out when the corresponding
4952/// mask bit is not set), and store the results in dst. Each complex number is composed of two adjacent half-precision
4953/// (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
4954/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4955///
4956/// Rounding is done according to the rounding parameter, which can be one of:
4957///
4958/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
4959/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
4960/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
4961/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
4962/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
4963///
4964/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fcmadd_round_pch)
4965#[inline]
4966#[target_feature(enable = "avx512fp16")]
4967#[cfg_attr(test, assert_instr(vfcmaddcph, ROUNDING = 8))]
4968#[rustc_legacy_const_generics(4)]
4969#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4970pub fn _mm512_maskz_fcmadd_round_pch<const ROUNDING: i32>(
4971    k: __mmask16,
4972    a: __m512h,
4973    b: __m512h,
4974    c: __m512h,
4975) -> __m512h {
4976    unsafe {
4977        static_assert_rounding!(ROUNDING);
4978        transmute(vfcmaddcph_maskz_512(
4979            transmute(a),
4980            transmute(b),
4981            transmute(c),
4982            k,
4983            ROUNDING,
4984        ))
4985    }
4986}
4987
4988/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
4989/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
4990/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
4991/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
4992/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
4993///
4994/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_sch)
4995#[inline]
4996#[target_feature(enable = "avx512fp16")]
4997#[cfg_attr(test, assert_instr(vfcmaddcsh))]
4998#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
4999pub fn _mm_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5000    _mm_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c)
5001}
5002
5003/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5004/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5005/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
5006/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5007/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5008/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5009///
5010/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_sch)
5011#[inline]
5012#[target_feature(enable = "avx512fp16")]
5013#[cfg_attr(test, assert_instr(vfcmaddcsh))]
5014#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5015pub fn _mm_mask_fcmadd_sch(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5016    _mm_mask_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, k, b, c)
5017}
5018
5019/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5020/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5021/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
5022/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5023/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5024/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5025///
5026/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_sch)
5027#[inline]
5028#[target_feature(enable = "avx512fp16")]
5029#[cfg_attr(test, assert_instr(vfcmaddcsh))]
5030#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5031pub fn _mm_mask3_fcmadd_sch(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5032    _mm_mask3_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(a, b, c, k)
5033}
5034
5035/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5036/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5037/// zeromask k (the element is zeroed out when the corresponding mask bit is not set), and copy the upper
5038/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5039/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5040/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5041///
5042/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_sch)
5043#[inline]
5044#[target_feature(enable = "avx512fp16")]
5045#[cfg_attr(test, assert_instr(vfcmaddcsh))]
5046#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5047pub fn _mm_maskz_fcmadd_sch(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5048    _mm_maskz_fcmadd_round_sch::<_MM_FROUND_CUR_DIRECTION>(k, a, b, c)
5049}
5050
5051/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5052/// accumulate to the lower complex number in c, and store the result in the lower elements of dst,
5053/// and copy the upper 6 packed elements from a to the upper elements of dst. Each complex number is
5054/// composed of two adjacent half-precision (16-bit) floating-point elements, which defines the complex
5055/// number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5056///
5057/// Rounding is done according to the rounding parameter, which can be one of:
5058///
5059/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5060/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5061/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5062/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5063/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5064///
5065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fcmadd_round_sch)
5066#[inline]
5067#[target_feature(enable = "avx512fp16")]
5068#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5069#[rustc_legacy_const_generics(3)]
5070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5071pub fn _mm_fcmadd_round_sch<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5072    unsafe {
5073        static_assert_rounding!(ROUNDING);
5074        transmute(vfcmaddcsh_mask(
5075            transmute(a),
5076            transmute(b),
5077            transmute(c),
5078            0xff,
5079            ROUNDING,
5080        ))
5081    }
5082}
5083
5084/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5085/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5086/// writemask k (the element is copied from a when the corresponding mask bit is not set), and copy the upper
5087/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5088/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5089/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5090///
5091/// Rounding is done according to the rounding parameter, which can be one of:
5092///
5093/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5094/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5095/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5096/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5097/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5098///
5099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fcmadd_round_sch)
5100#[inline]
5101#[target_feature(enable = "avx512fp16")]
5102#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5103#[rustc_legacy_const_generics(4)]
5104#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5105pub fn _mm_mask_fcmadd_round_sch<const ROUNDING: i32>(
5106    a: __m128h,
5107    k: __mmask8,
5108    b: __m128h,
5109    c: __m128h,
5110) -> __m128h {
5111    unsafe {
5112        static_assert_rounding!(ROUNDING);
5113        let a = transmute(a);
5114        let r = vfcmaddcsh_mask(a, transmute(b), transmute(c), k, ROUNDING);
5115        transmute(_mm_mask_move_ss(a, k, a, r))
5116    }
5117}
5118
5119/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5120/// accumulate to the lower complex number in c, and store the result in the lower elements of dst using
5121/// writemask k (the element is copied from c when the corresponding mask bit is not set), and copy the upper
5122/// 6 packed elements from a to the upper elements of dst. Each complex number is composed of two adjacent
5123/// half-precision (16-bit) floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`,
5124/// or the complex conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5125///
5126/// Rounding is done according to the rounding parameter, which can be one of:
5127///
5128/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5129/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5130/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5131/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5132/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5133///
5134/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fcmadd_round_sch)
5135#[inline]
5136#[target_feature(enable = "avx512fp16")]
5137#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5138#[rustc_legacy_const_generics(4)]
5139#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5140pub fn _mm_mask3_fcmadd_round_sch<const ROUNDING: i32>(
5141    a: __m128h,
5142    b: __m128h,
5143    c: __m128h,
5144    k: __mmask8,
5145) -> __m128h {
5146    unsafe {
5147        static_assert_rounding!(ROUNDING);
5148        let c = transmute(c);
5149        let r = vfcmaddcsh_mask(transmute(a), transmute(b), c, k, ROUNDING);
5150        transmute(_mm_move_ss(c, r))
5151    }
5152}
5153
5154/// Multiply the lower complex number in a by the complex conjugate of the lower complex number in b,
5155/// accumulate to the lower complex number in c using zeromask k (the element is zeroed out when the corresponding
5156/// mask bit is not set), and store the result in the lower elements of dst, and copy the upper 6 packed elements
5157/// from a to the upper elements of dst. Each complex number is composed of two adjacent half-precision (16-bit)
5158/// floating-point elements, which defines the complex number `complex = vec.fp16[0] + i * vec.fp16[1]`, or the complex
5159/// conjugate `conjugate = vec.fp16[0] - i * vec.fp16[1]`.
5160///
5161/// Rounding is done according to the rounding parameter, which can be one of:
5162///
5163/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5164/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5165/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5166/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5167/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5168///
5169/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fcmadd_round_sch)
5170#[inline]
5171#[target_feature(enable = "avx512fp16")]
5172#[cfg_attr(test, assert_instr(vfcmaddcsh, ROUNDING = 8))]
5173#[rustc_legacy_const_generics(4)]
5174#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5175pub fn _mm_maskz_fcmadd_round_sch<const ROUNDING: i32>(
5176    k: __mmask8,
5177    a: __m128h,
5178    b: __m128h,
5179    c: __m128h,
5180) -> __m128h {
5181    unsafe {
5182        static_assert_rounding!(ROUNDING);
5183        transmute(vfcmaddcsh_maskz(
5184            transmute(a),
5185            transmute(b),
5186            transmute(c),
5187            k,
5188            ROUNDING,
5189        ))
5190    }
5191}
5192
5193/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5194/// result to packed elements in c, and store the results in dst.
5195///
5196/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_ph)
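///
/// A minimal usage sketch with illustrative values (not taken from Intel's documentation),
/// assuming the `avx512fp16` and `avx512vl` target features are available:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Every f16 lane holds 2.0 * 3.0 + 1.0 = 7.0.
/// let r = _mm_fmadd_ph(a, b, c);
/// ```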
5197#[inline]
5198#[target_feature(enable = "avx512fp16,avx512vl")]
5199#[cfg_attr(test, assert_instr(vfmadd))]
5200#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5201pub fn _mm_fmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5202    unsafe { simd_fma(a, b, c) }
5203}
5204
5205/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5206/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5207/// from a when the corresponding mask bit is not set).
5208///
5209/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ph)
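///
/// A minimal usage sketch with illustrative values (not taken from Intel's documentation),
/// assuming the `avx512fp16` and `avx512vl` target features are available:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Mask 0b0000_0101 selects lanes 0 and 2: those lanes hold 2.0 * 3.0 + 1.0 = 7.0,
/// // while the remaining lanes are copied from `a` and stay 2.0.
/// let r = _mm_mask_fmadd_ph(a, 0b0000_0101, b, c);
/// ```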
5210#[inline]
5211#[target_feature(enable = "avx512fp16,avx512vl")]
5212#[cfg_attr(test, assert_instr(vfmadd))]
5213#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5214pub fn _mm_mask_fmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5215    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), a) }
5216}
5217
5218/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5219/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5220/// from c when the corresponding mask bit is not set).
5221///
5222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ph)
5223#[inline]
5224#[target_feature(enable = "avx512fp16,avx512vl")]
5225#[cfg_attr(test, assert_instr(vfmadd))]
5226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5227pub fn _mm_mask3_fmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5228    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), c) }
5229}
5230
5231/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5232/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5233/// out when the corresponding mask bit is not set).
5234///
5235/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ph)
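///
/// A minimal usage sketch with illustrative values (not taken from Intel's documentation),
/// assuming the `avx512fp16` and `avx512vl` target features are available:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Mask 0b1111_0000 selects lanes 4..=7: those lanes hold 7.0, lanes 0..=3 are zeroed.
/// let r = _mm_maskz_fmadd_ph(0b1111_0000, a, b, c);
/// ```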
5236#[inline]
5237#[target_feature(enable = "avx512fp16,avx512vl")]
5238#[cfg_attr(test, assert_instr(vfmadd))]
5239#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5240pub fn _mm_maskz_fmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5241    unsafe { simd_select_bitmask(k, _mm_fmadd_ph(a, b, c), _mm_setzero_ph()) }
5242}
5243
5244/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5245/// result to packed elements in c, and store the results in dst.
5246///
5247/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmadd_ph)
5248#[inline]
5249#[target_feature(enable = "avx512fp16,avx512vl")]
5250#[cfg_attr(test, assert_instr(vfmadd))]
5251#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5252pub fn _mm256_fmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5253    unsafe { simd_fma(a, b, c) }
5254}
5255
5256/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5257/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5258/// from a when the corresponding mask bit is not set).
5259///
5260/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ph)
5261#[inline]
5262#[target_feature(enable = "avx512fp16,avx512vl")]
5263#[cfg_attr(test, assert_instr(vfmadd))]
5264#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5265pub fn _mm256_mask_fmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5266    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), a) }
5267}
5268
5269/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5270/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5271/// from c when the corresponding mask bit is not set).
5272///
5273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ph)
5274#[inline]
5275#[target_feature(enable = "avx512fp16,avx512vl")]
5276#[cfg_attr(test, assert_instr(vfmadd))]
5277#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5278pub fn _mm256_mask3_fmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5279    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), c) }
5280}
5281
5282/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5283/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5284/// out when the corresponding mask bit is not set).
5285///
5286/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ph)
5287#[inline]
5288#[target_feature(enable = "avx512fp16,avx512vl")]
5289#[cfg_attr(test, assert_instr(vfmadd))]
5290#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5291pub fn _mm256_maskz_fmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5292    unsafe { simd_select_bitmask(k, _mm256_fmadd_ph(a, b, c), _mm256_setzero_ph()) }
5293}
5294
5295/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5296/// result to packed elements in c, and store the results in dst.
5297///
5298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ph)
5299#[inline]
5300#[target_feature(enable = "avx512fp16")]
5301#[cfg_attr(test, assert_instr(vfmadd))]
5302#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5303pub fn _mm512_fmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5304    unsafe { simd_fma(a, b, c) }
5305}
5306
5307/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5308/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5309/// from a when the corresponding mask bit is not set).
5310///
5311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ph)
5312#[inline]
5313#[target_feature(enable = "avx512fp16")]
5314#[cfg_attr(test, assert_instr(vfmadd))]
5315#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5316pub fn _mm512_mask_fmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5317    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), a) }
5318}
5319
5320/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5321/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5322/// from c when the corresponding mask bit is not set).
5323///
5324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ph)
5325#[inline]
5326#[target_feature(enable = "avx512fp16")]
5327#[cfg_attr(test, assert_instr(vfmadd))]
5328#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5329pub fn _mm512_mask3_fmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5330    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), c) }
5331}
5332
5333/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5334/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5335/// out when the corresponding mask bit is not set).
5336///
5337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ph)
5338#[inline]
5339#[target_feature(enable = "avx512fp16")]
5340#[cfg_attr(test, assert_instr(vfmadd))]
5341#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5342pub fn _mm512_maskz_fmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5343    unsafe { simd_select_bitmask(k, _mm512_fmadd_ph(a, b, c), _mm512_setzero_ph()) }
5344}
5345
5346/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5347/// result to packed elements in c, and store the results in dst.
5348///
5349/// Rounding is done according to the rounding parameter, which can be one of:
5350///
5351/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5352/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5353/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5354/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5355/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5356///
5357/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ph)
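///
/// A minimal usage sketch (not compiled here; assumes the unstable `f16` and
/// `stdarch_x86_avx512_f16` features, `avx512fp16` support, and `_mm512_set1_ph`
/// from elsewhere in this module):
///
/// ```ignore
/// let a = _mm512_set1_ph(2.0);
/// let b = _mm512_set1_ph(3.0);
/// let c = _mm512_set1_ph(1.0);
/// // The rounding mode is passed as a const generic; here round-to-nearest with
/// // exceptions suppressed (the same value, 8, used by the `assert_instr` test above).
/// let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```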
5358#[inline]
5359#[target_feature(enable = "avx512fp16")]
5360#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5361#[rustc_legacy_const_generics(3)]
5362#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5363pub fn _mm512_fmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5364    unsafe {
5365        static_assert_rounding!(ROUNDING);
5366        vfmaddph_512(a, b, c, ROUNDING)
5367    }
5368}
5369
5370/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5371/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5372/// from a when the corresponding mask bit is not set).
5373///
5374/// Rounding is done according to the rounding parameter, which can be one of:
5375///
5376/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5377/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5378/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5379/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5380/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5381///
5382/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ph)
5383#[inline]
5384#[target_feature(enable = "avx512fp16")]
5385#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5386#[rustc_legacy_const_generics(4)]
5387#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5388pub fn _mm512_mask_fmadd_round_ph<const ROUNDING: i32>(
5389    a: __m512h,
5390    k: __mmask32,
5391    b: __m512h,
5392    c: __m512h,
5393) -> __m512h {
5394    unsafe {
5395        static_assert_rounding!(ROUNDING);
5396        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), a)
5397    }
5398}
5399
5400/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5401/// result to packed elements in c, and store the results in dst using writemask k (the element is copied
5402/// from c when the corresponding mask bit is not set).
5403///
5404/// Rounding is done according to the rounding parameter, which can be one of:
5405///
5406/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5407/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5408/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5409/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5410/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5411///
5412/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ph)
5413#[inline]
5414#[target_feature(enable = "avx512fp16")]
5415#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5416#[rustc_legacy_const_generics(4)]
5417#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5418pub fn _mm512_mask3_fmadd_round_ph<const ROUNDING: i32>(
5419    a: __m512h,
5420    b: __m512h,
5421    c: __m512h,
5422    k: __mmask32,
5423) -> __m512h {
5424    unsafe {
5425        static_assert_rounding!(ROUNDING);
5426        simd_select_bitmask(k, _mm512_fmadd_round_ph::<ROUNDING>(a, b, c), c)
5427    }
5428}
5429
5430/// Multiply packed half-precision (16-bit) floating-point elements in a and b, add the intermediate
5431/// result to packed elements in c, and store the results in dst using zeromask k (the element is zeroed
5432/// out when the corresponding mask bit is not set).
5433///
5434/// Rounding is done according to the rounding parameter, which can be one of:
5435///
5436/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5437/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5438/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5439/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5440/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5441///
5442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ph)
5443#[inline]
5444#[target_feature(enable = "avx512fp16")]
5445#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5446#[rustc_legacy_const_generics(4)]
5447#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5448pub fn _mm512_maskz_fmadd_round_ph<const ROUNDING: i32>(
5449    k: __mmask32,
5450    a: __m512h,
5451    b: __m512h,
5452    c: __m512h,
5453) -> __m512h {
5454    unsafe {
5455        static_assert_rounding!(ROUNDING);
5456        simd_select_bitmask(
5457            k,
5458            _mm512_fmadd_round_ph::<ROUNDING>(a, b, c),
5459            _mm512_setzero_ph(),
5460        )
5461    }
5462}
5463
5464/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5465/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5466/// 7 packed elements from a to the upper elements of dst.
5467///
5468/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_sh)
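///
/// A minimal usage sketch (not compiled here; assumes the unstable `f16` and
/// `stdarch_x86_avx512_f16` features and `avx512fp16` support):
///
/// ```ignore
/// let a = _mm_set_sh(2.0); // lower lane 2.0, upper 7 lanes zeroed
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(1.0);
/// // The lower lane of `r` is 2.0 * 3.0 + 1.0 = 7.0; the upper 7 lanes are copied from `a`.
/// let r = _mm_fmadd_sh(a, b, c);
/// ```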
5469#[inline]
5470#[target_feature(enable = "avx512fp16")]
5471#[cfg_attr(test, assert_instr(vfmadd))]
5472#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5473pub fn _mm_fmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5474    unsafe {
5475        let extracta: f16 = simd_extract!(a, 0);
5476        let extractb: f16 = simd_extract!(b, 0);
5477        let extractc: f16 = simd_extract!(c, 0);
5478        let r = fmaf16(extracta, extractb, extractc);
5479        simd_insert!(a, 0, r)
5480    }
5481}
5482
5483/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5484/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5485/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5486/// upper elements of dst.
5487///
5488/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_sh)
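///
/// A minimal usage sketch (not compiled here; assumes the unstable `f16` and
/// `stdarch_x86_avx512_f16` features and `avx512fp16` support):
///
/// ```ignore
/// let a = _mm_set_sh(2.0);
/// let b = _mm_set_sh(3.0);
/// let c = _mm_set_sh(1.0);
/// // Mask bit 0 is set, so the lower lane becomes 2.0 * 3.0 + 1.0 = 7.0; with a mask of 0
/// // the lower lane would instead keep the value from `a` (2.0).
/// let r = _mm_mask_fmadd_sh(a, 0b1, b, c);
/// ```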
5489#[inline]
5490#[target_feature(enable = "avx512fp16")]
5491#[cfg_attr(test, assert_instr(vfmadd))]
5492#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5493pub fn _mm_mask_fmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5494    unsafe {
5495        let mut fmadd: f16 = simd_extract!(a, 0);
5496        if k & 1 != 0 {
5497            let extractb: f16 = simd_extract!(b, 0);
5498            let extractc: f16 = simd_extract!(c, 0);
5499            fmadd = fmaf16(fmadd, extractb, extractc);
5500        }
5501        simd_insert!(a, 0, fmadd)
5502    }
5503}
5504
5505/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5506/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5507/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5508/// upper elements of dst.
5509///
5510/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_sh)
5511#[inline]
5512#[target_feature(enable = "avx512fp16")]
5513#[cfg_attr(test, assert_instr(vfmadd))]
5514#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5515pub fn _mm_mask3_fmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5516    unsafe {
5517        let mut fmadd: f16 = simd_extract!(c, 0);
5518        if k & 1 != 0 {
5519            let extracta: f16 = simd_extract!(a, 0);
5520            let extractb: f16 = simd_extract!(b, 0);
5521            fmadd = fmaf16(extracta, extractb, fmadd);
5522        }
5523        simd_insert!(c, 0, fmadd)
5524    }
5525}
5526
5527/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5528/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5529/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5530/// upper elements of dst.
5531///
5532/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_sh)
5533#[inline]
5534#[target_feature(enable = "avx512fp16")]
5535#[cfg_attr(test, assert_instr(vfmadd))]
5536#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5537pub fn _mm_maskz_fmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5538    unsafe {
5539        let mut fmadd: f16 = 0.0;
5540        if k & 1 != 0 {
5541            let extracta: f16 = simd_extract!(a, 0);
5542            let extractb: f16 = simd_extract!(b, 0);
5543            let extractc: f16 = simd_extract!(c, 0);
5544            fmadd = fmaf16(extracta, extractb, extractc);
5545        }
5546        simd_insert!(a, 0, fmadd)
5547    }
5548}
5549
5550/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5551/// result to the lower element in c. Store the result in the lower element of dst, and copy the upper
5552/// 7 packed elements from a to the upper elements of dst.
5553///
5554/// Rounding is done according to the rounding parameter, which can be one of:
5555///
5556/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5557/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5558/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5559/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5560/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5561///
5562/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd_round_sh)
5563#[inline]
5564#[target_feature(enable = "avx512fp16")]
5565#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5566#[rustc_legacy_const_generics(3)]
5567#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5568pub fn _mm_fmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5569    unsafe {
5570        static_assert_rounding!(ROUNDING);
5571        let extracta: f16 = simd_extract!(a, 0);
5572        let extractb: f16 = simd_extract!(b, 0);
5573        let extractc: f16 = simd_extract!(c, 0);
5574        let r = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5575        simd_insert!(a, 0, r)
5576    }
5577}
5578
5579/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5580/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5581/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5582/// upper elements of dst.
5583///
5584/// Rounding is done according to the rounding parameter, which can be one of:
5585///
5586/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5587/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5588/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5589/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5590/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5591///
5592/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_round_sh)
5593#[inline]
5594#[target_feature(enable = "avx512fp16")]
5595#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5596#[rustc_legacy_const_generics(4)]
5597#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5598pub fn _mm_mask_fmadd_round_sh<const ROUNDING: i32>(
5599    a: __m128h,
5600    k: __mmask8,
5601    b: __m128h,
5602    c: __m128h,
5603) -> __m128h {
5604    unsafe {
5605        static_assert_rounding!(ROUNDING);
5606        let mut fmadd: f16 = simd_extract!(a, 0);
5607        if k & 1 != 0 {
5608            let extractb: f16 = simd_extract!(b, 0);
5609            let extractc: f16 = simd_extract!(c, 0);
5610            fmadd = vfmaddsh(fmadd, extractb, extractc, ROUNDING);
5611        }
5612        simd_insert!(a, 0, fmadd)
5613    }
5614}
5615
5616/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5617/// result to the lower element in c. Store the result in the lower element of dst using writemask k (the element
5618/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
5619/// upper elements of dst.
5620///
5621/// Rounding is done according to the rounding parameter, which can be one of:
5622///
5623/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5624/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5625/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5626/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5627/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5628///
5629/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_round_sh)
5630#[inline]
5631#[target_feature(enable = "avx512fp16")]
5632#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5633#[rustc_legacy_const_generics(4)]
5634#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5635pub fn _mm_mask3_fmadd_round_sh<const ROUNDING: i32>(
5636    a: __m128h,
5637    b: __m128h,
5638    c: __m128h,
5639    k: __mmask8,
5640) -> __m128h {
5641    unsafe {
5642        static_assert_rounding!(ROUNDING);
5643        let mut fmadd: f16 = simd_extract!(c, 0);
5644        if k & 1 != 0 {
5645            let extracta: f16 = simd_extract!(a, 0);
5646            let extractb: f16 = simd_extract!(b, 0);
5647            fmadd = vfmaddsh(extracta, extractb, fmadd, ROUNDING);
5648        }
5649        simd_insert!(c, 0, fmadd)
5650    }
5651}
5652
5653/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and add the intermediate
5654/// result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element
5655/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5656/// upper elements of dst.
5657///
5658/// Rounding is done according to the rounding parameter, which can be one of:
5659///
5660/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5661/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5662/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5663/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5664/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5665///
5666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_round_sh)
5667#[inline]
5668#[target_feature(enable = "avx512fp16")]
5669#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))]
5670#[rustc_legacy_const_generics(4)]
5671#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5672pub fn _mm_maskz_fmadd_round_sh<const ROUNDING: i32>(
5673    k: __mmask8,
5674    a: __m128h,
5675    b: __m128h,
5676    c: __m128h,
5677) -> __m128h {
5678    unsafe {
5679        static_assert_rounding!(ROUNDING);
5680        let mut fmadd: f16 = 0.0;
5681        if k & 1 != 0 {
5682            let extracta: f16 = simd_extract!(a, 0);
5683            let extractb: f16 = simd_extract!(b, 0);
5684            let extractc: f16 = simd_extract!(c, 0);
5685            fmadd = vfmaddsh(extracta, extractb, extractc, ROUNDING);
5686        }
5687        simd_insert!(a, 0, fmadd)
5688    }
5689}
5690
5691/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5692/// in c from the intermediate result, and store the results in dst.
5694///
5695/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_ph)
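///
/// A minimal usage sketch (not compiled here; assumes the unstable `f16` and
/// `stdarch_x86_avx512_f16` features and `avx512fp16`/`avx512vl` support):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Each of the 8 lanes computes 2.0 * 3.0 - 1.0 = 5.0.
/// let r = _mm_fmsub_ph(a, b, c);
/// ```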
5696#[inline]
5697#[target_feature(enable = "avx512fp16,avx512vl")]
5698#[cfg_attr(test, assert_instr(vfmsub))]
5699#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5700pub fn _mm_fmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5701    unsafe { simd_fma(a, b, simd_neg(c)) }
5702}
5703
5704/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5705/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5706/// from a when the corresponding mask bit is not set).
5707///
5708/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ph)
5709#[inline]
5710#[target_feature(enable = "avx512fp16,avx512vl")]
5711#[cfg_attr(test, assert_instr(vfmsub))]
5712#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5713pub fn _mm_mask_fmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5714    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), a) }
5715}
5716
5717/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5718/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5719/// from c when the corresponding mask bit is not set).
5720///
5721/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ph)
5722#[inline]
5723#[target_feature(enable = "avx512fp16,avx512vl")]
5724#[cfg_attr(test, assert_instr(vfmsub))]
5725#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5726pub fn _mm_mask3_fmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
5727    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), c) }
5728}
5729
5730/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5731/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5732/// out when the corresponding mask bit is not set).
5733///
5734/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ph)
5735#[inline]
5736#[target_feature(enable = "avx512fp16,avx512vl")]
5737#[cfg_attr(test, assert_instr(vfmsub))]
5738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5739pub fn _mm_maskz_fmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5740    unsafe { simd_select_bitmask(k, _mm_fmsub_ph(a, b, c), _mm_setzero_ph()) }
5741}
5742
5743/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5744/// in c from the intermediate result, and store the results in dst.
5745///
5746/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsub_ph)
5747#[inline]
5748#[target_feature(enable = "avx512fp16,avx512vl")]
5749#[cfg_attr(test, assert_instr(vfmsub))]
5750#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5751pub fn _mm256_fmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5752    unsafe { simd_fma(a, b, simd_neg(c)) }
5753}
5754
5755/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5756/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5757/// from a when the corresponding mask bit is not set).
5758///
5759/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ph)
5760#[inline]
5761#[target_feature(enable = "avx512fp16,avx512vl")]
5762#[cfg_attr(test, assert_instr(vfmsub))]
5763#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5764pub fn _mm256_mask_fmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
5765    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), a) }
5766}
5767
5768/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5769/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5770/// from c when the corresponding mask bit is not set).
5771///
5772/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ph)
5773#[inline]
5774#[target_feature(enable = "avx512fp16,avx512vl")]
5775#[cfg_attr(test, assert_instr(vfmsub))]
5776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5777pub fn _mm256_mask3_fmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
5778    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), c) }
5779}
5780
5781/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5782/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5783/// out when the corresponding mask bit is not set).
5784///
5785/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ph)
5786#[inline]
5787#[target_feature(enable = "avx512fp16,avx512vl")]
5788#[cfg_attr(test, assert_instr(vfmsub))]
5789#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5790pub fn _mm256_maskz_fmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
5791    unsafe { simd_select_bitmask(k, _mm256_fmsub_ph(a, b, c), _mm256_setzero_ph()) }
5792}
5793
5794/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5795/// in c from the intermediate result, and store the results in dst.
5796///
5797/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ph)
5798#[inline]
5799#[target_feature(enable = "avx512fp16")]
5800#[cfg_attr(test, assert_instr(vfmsub))]
5801#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5802pub fn _mm512_fmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5803    unsafe { simd_fma(a, b, simd_neg(c)) }
5804}
5805
5806/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5807/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5808/// from a when the corresponding mask bit is not set).
5809///
5810/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ph)
5811#[inline]
5812#[target_feature(enable = "avx512fp16")]
5813#[cfg_attr(test, assert_instr(vfmsub))]
5814#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5815pub fn _mm512_mask_fmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
5816    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), a) }
5817}
5818
5819/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5820/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5821/// from c when the corresponding mask bit is not set).
5822///
5823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ph)
5824#[inline]
5825#[target_feature(enable = "avx512fp16")]
5826#[cfg_attr(test, assert_instr(vfmsub))]
5827#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5828pub fn _mm512_mask3_fmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
5829    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), c) }
5830}
5831
5832/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5833/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5834/// out when the corresponding mask bit is not set).
5835///
5836/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ph)
5837#[inline]
5838#[target_feature(enable = "avx512fp16")]
5839#[cfg_attr(test, assert_instr(vfmsub))]
5840#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5841pub fn _mm512_maskz_fmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5842    unsafe { simd_select_bitmask(k, _mm512_fmsub_ph(a, b, c), _mm512_setzero_ph()) }
5843}
5844
5845/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5846/// in c from the intermediate result, and store the results in dst.
5847///
5848/// Rounding is done according to the rounding parameter, which can be one of:
5849///
5850/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5851/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5852/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5853/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5854/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5855///
5856/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ph)
5857#[inline]
5858#[target_feature(enable = "avx512fp16")]
5859#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5860#[rustc_legacy_const_generics(3)]
5861#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5862pub fn _mm512_fmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
5863    unsafe {
5864        static_assert_rounding!(ROUNDING);
5865        vfmaddph_512(a, b, simd_neg(c), ROUNDING)
5866    }
5867}
5868
5869/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5870/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5871/// from a when the corresponding mask bit is not set).
5872///
5873/// Rounding is done according to the rounding parameter, which can be one of:
5874///
5875/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5876/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5877/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5878/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5879/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5880///
5881/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ph)
5882#[inline]
5883#[target_feature(enable = "avx512fp16")]
5884#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5885#[rustc_legacy_const_generics(4)]
5886#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5887pub fn _mm512_mask_fmsub_round_ph<const ROUNDING: i32>(
5888    a: __m512h,
5889    k: __mmask32,
5890    b: __m512h,
5891    c: __m512h,
5892) -> __m512h {
5893    unsafe {
5894        static_assert_rounding!(ROUNDING);
5895        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), a)
5896    }
5897}
5898
5899/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5900/// in c from the intermediate result, and store the results in dst using writemask k (the element is copied
5901/// from c when the corresponding mask bit is not set).
5902///
5903/// Rounding is done according to the rounding parameter, which can be one of:
5904///
5905/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5906/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5907/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5908/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5909/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5910///
5911/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ph)
5912#[inline]
5913#[target_feature(enable = "avx512fp16")]
5914#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5915#[rustc_legacy_const_generics(4)]
5916#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5917pub fn _mm512_mask3_fmsub_round_ph<const ROUNDING: i32>(
5918    a: __m512h,
5919    b: __m512h,
5920    c: __m512h,
5921    k: __mmask32,
5922) -> __m512h {
5923    unsafe {
5924        static_assert_rounding!(ROUNDING);
5925        simd_select_bitmask(k, _mm512_fmsub_round_ph::<ROUNDING>(a, b, c), c)
5926    }
5927}
5928
5929/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
5930/// in c from the intermediate result, and store the results in dst using zeromask k (the element is zeroed
5931/// out when the corresponding mask bit is not set).
5932///
5933/// Rounding is done according to the rounding parameter, which can be one of:
5934///
5935/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
5936/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
5937/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
5938/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
5939/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
5940///
5941/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ph)
5942#[inline]
5943#[target_feature(enable = "avx512fp16")]
5944#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
5945#[rustc_legacy_const_generics(4)]
5946#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5947pub fn _mm512_maskz_fmsub_round_ph<const ROUNDING: i32>(
5948    k: __mmask32,
5949    a: __m512h,
5950    b: __m512h,
5951    c: __m512h,
5952) -> __m512h {
5953    unsafe {
5954        static_assert_rounding!(ROUNDING);
5955        simd_select_bitmask(
5956            k,
5957            _mm512_fmsub_round_ph::<ROUNDING>(a, b, c),
5958            _mm512_setzero_ph(),
5959        )
5960    }
5961}
5962
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
5965/// 7 packed elements from a to the upper elements of dst.
5966///
5967/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_sh)
5968#[inline]
5969#[target_feature(enable = "avx512fp16")]
5970#[cfg_attr(test, assert_instr(vfmsub))]
5971#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5972pub fn _mm_fmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
5973    unsafe {
5974        let extracta: f16 = simd_extract!(a, 0);
5975        let extractb: f16 = simd_extract!(b, 0);
5976        let extractc: f16 = simd_extract!(c, 0);
5977        let r = fmaf16(extracta, extractb, -extractc);
5978        simd_insert!(a, 0, r)
5979    }
5980}
5981
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
5984/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
5985/// upper elements of dst.
5986///
5987/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_sh)
5988#[inline]
5989#[target_feature(enable = "avx512fp16")]
5990#[cfg_attr(test, assert_instr(vfmsub))]
5991#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
5992pub fn _mm_mask_fmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
5993    unsafe {
5994        let mut fmsub: f16 = simd_extract!(a, 0);
5995        if k & 1 != 0 {
5996            let extractb: f16 = simd_extract!(b, 0);
5997            let extractc: f16 = simd_extract!(c, 0);
5998            fmsub = fmaf16(fmsub, extractb, -extractc);
5999        }
6000        simd_insert!(a, 0, fmsub)
6001    }
6002}
6003
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6006/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6007/// upper elements of dst.
6008///
6009/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_sh)
6010#[inline]
6011#[target_feature(enable = "avx512fp16")]
6012#[cfg_attr(test, assert_instr(vfmsub))]
6013#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6014pub fn _mm_mask3_fmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6015    unsafe {
6016        let mut fmsub: f16 = simd_extract!(c, 0);
6017        if k & 1 != 0 {
6018            let extracta: f16 = simd_extract!(a, 0);
6019            let extractb: f16 = simd_extract!(b, 0);
6020            fmsub = fmaf16(extracta, extractb, -fmsub);
6021        }
6022        simd_insert!(c, 0, fmsub)
6023    }
6024}
6025
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
6028/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6029/// upper elements of dst.
6030///
6031/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_sh)
6032#[inline]
6033#[target_feature(enable = "avx512fp16")]
6034#[cfg_attr(test, assert_instr(vfmsub))]
6035#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6036pub fn _mm_maskz_fmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6037    unsafe {
6038        let mut fmsub: f16 = 0.0;
6039        if k & 1 != 0 {
6040            let extracta: f16 = simd_extract!(a, 0);
6041            let extractb: f16 = simd_extract!(b, 0);
6042            let extractc: f16 = simd_extract!(c, 0);
6043            fmsub = fmaf16(extracta, extractb, -extractc);
6044        }
6045        simd_insert!(a, 0, fmsub)
6046    }
6047}
6048
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper
6051/// 7 packed elements from a to the upper elements of dst.
6052///
6053/// Rounding is done according to the rounding parameter, which can be one of:
6054///
6055/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6056/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6057/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6058/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6059/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6060///
6061/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsub_round_sh)
6062#[inline]
6063#[target_feature(enable = "avx512fp16")]
6064#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6065#[rustc_legacy_const_generics(3)]
6066#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6067pub fn _mm_fmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6068    unsafe {
6069        static_assert_rounding!(ROUNDING);
6070        let extracta: f16 = simd_extract!(a, 0);
6071        let extractb: f16 = simd_extract!(b, 0);
6072        let extractc: f16 = simd_extract!(c, 0);
6073        let r = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6074        simd_insert!(a, 0, r)
6075    }
6076}
6077
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6080/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the
6081/// upper elements of dst.
6082///
6083/// Rounding is done according to the rounding parameter, which can be one of:
6084///
6085/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6086/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6087/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6088/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6089/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6090///
6091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_round_sh)
6092#[inline]
6093#[target_feature(enable = "avx512fp16")]
6094#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6095#[rustc_legacy_const_generics(4)]
6096#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6097pub fn _mm_mask_fmsub_round_sh<const ROUNDING: i32>(
6098    a: __m128h,
6099    k: __mmask8,
6100    b: __m128h,
6101    c: __m128h,
6102) -> __m128h {
6103    unsafe {
6104        static_assert_rounding!(ROUNDING);
6105        let mut fmsub: f16 = simd_extract!(a, 0);
6106        if k & 1 != 0 {
6107            let extractb: f16 = simd_extract!(b, 0);
6108            let extractc: f16 = simd_extract!(c, 0);
6109            fmsub = vfmaddsh(fmsub, extractb, -extractc, ROUNDING);
6110        }
6111        simd_insert!(a, 0, fmsub)
6112    }
6113}
6114
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element
6117/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the
6118/// upper elements of dst.
6119///
6120/// Rounding is done according to the rounding parameter, which can be one of:
6121///
6122/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6123/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6124/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6125/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6126/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6127///
6128/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_round_sh)
6129#[inline]
6130#[target_feature(enable = "avx512fp16")]
6131#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6132#[rustc_legacy_const_generics(4)]
6133#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6134pub fn _mm_mask3_fmsub_round_sh<const ROUNDING: i32>(
6135    a: __m128h,
6136    b: __m128h,
6137    c: __m128h,
6138    k: __mmask8,
6139) -> __m128h {
6140    unsafe {
6141        static_assert_rounding!(ROUNDING);
6142        let mut fmsub: f16 = simd_extract!(c, 0);
6143        if k & 1 != 0 {
6144            let extracta: f16 = simd_extract!(a, 0);
6145            let extractb: f16 = simd_extract!(b, 0);
6146            fmsub = vfmaddsh(extracta, extractb, -fmsub, ROUNDING);
6147        }
6148        simd_insert!(c, 0, fmsub)
6149    }
6150}
6151
/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
/// element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element
/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
/// upper elements of dst.
///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
///
6157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_round_sh)
6158#[inline]
6159#[target_feature(enable = "avx512fp16")]
6160#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
6161#[rustc_legacy_const_generics(4)]
6162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6163pub fn _mm_maskz_fmsub_round_sh<const ROUNDING: i32>(
6164    k: __mmask8,
6165    a: __m128h,
6166    b: __m128h,
6167    c: __m128h,
6168) -> __m128h {
6169    unsafe {
6170        static_assert_rounding!(ROUNDING);
6171        let mut fmsub: f16 = 0.0;
6172        if k & 1 != 0 {
6173            let extracta: f16 = simd_extract!(a, 0);
6174            let extractb: f16 = simd_extract!(b, 0);
6175            let extractc: f16 = simd_extract!(c, 0);
6176            fmsub = vfmaddsh(extracta, extractb, -extractc, ROUNDING);
6177        }
6178        simd_insert!(a, 0, fmsub)
6179    }
6180}
6181
6182/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6183/// result from packed elements in c, and store the results in dst.
6184///
6185/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_ph)
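///
/// A minimal usage sketch (not compiled here; assumes the unstable `f16` and
/// `stdarch_x86_avx512_f16` features and `avx512fp16`/`avx512vl` support):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Each of the 8 lanes computes -(2.0 * 3.0) + 1.0 = -5.0.
/// let r = _mm_fnmadd_ph(a, b, c);
/// ```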
6186#[inline]
6187#[target_feature(enable = "avx512fp16,avx512vl")]
6188#[cfg_attr(test, assert_instr(vfnmadd))]
6189#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6190pub fn _mm_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6191    unsafe { simd_fma(simd_neg(a), b, c) }
6192}
6193
6194/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6195/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6196/// from a when the corresponding mask bit is not set).
6197///
6198/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ph)
6199#[inline]
6200#[target_feature(enable = "avx512fp16,avx512vl")]
6201#[cfg_attr(test, assert_instr(vfnmadd))]
6202#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6203pub fn _mm_mask_fnmadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6204    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), a) }
6205}
6206
6207/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6208/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6209/// from c when the corresponding mask bit is not set).
6210///
6211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ph)
6212#[inline]
6213#[target_feature(enable = "avx512fp16,avx512vl")]
6214#[cfg_attr(test, assert_instr(vfnmadd))]
6215#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6216pub fn _mm_mask3_fnmadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6217    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), c) }
6218}
6219
6220/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6221/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6222/// out when the corresponding mask bit is not set).
6223///
6224/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ph)
6225#[inline]
6226#[target_feature(enable = "avx512fp16,avx512vl")]
6227#[cfg_attr(test, assert_instr(vfnmadd))]
6228#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6229pub fn _mm_maskz_fnmadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6230    unsafe { simd_select_bitmask(k, _mm_fnmadd_ph(a, b, c), _mm_setzero_ph()) }
6231}
6232
6233/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6234/// result from packed elements in c, and store the results in dst.
6235///
6236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmadd_ph)
6237#[inline]
6238#[target_feature(enable = "avx512fp16,avx512vl")]
6239#[cfg_attr(test, assert_instr(vfnmadd))]
6240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6241pub fn _mm256_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6242    unsafe { simd_fma(simd_neg(a), b, c) }
6243}
6244
6245/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6246/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6247/// from a when the corresponding mask bit is not set).
6248///
6249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ph)
6250#[inline]
6251#[target_feature(enable = "avx512fp16,avx512vl")]
6252#[cfg_attr(test, assert_instr(vfnmadd))]
6253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6254pub fn _mm256_mask_fnmadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6255    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), a) }
6256}
6257
6258/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6259/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6260/// from c when the corresponding mask bit is not set).
6261///
6262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ph)
6263#[inline]
6264#[target_feature(enable = "avx512fp16,avx512vl")]
6265#[cfg_attr(test, assert_instr(vfnmadd))]
6266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6267pub fn _mm256_mask3_fnmadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6268    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), c) }
6269}
6270
6271/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6272/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6273/// out when the corresponding mask bit is not set).
6274///
6275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ph)
6276#[inline]
6277#[target_feature(enable = "avx512fp16,avx512vl")]
6278#[cfg_attr(test, assert_instr(vfnmadd))]
6279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6280pub fn _mm256_maskz_fnmadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6281    unsafe { simd_select_bitmask(k, _mm256_fnmadd_ph(a, b, c), _mm256_setzero_ph()) }
6282}
6283
6284/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6285/// result from packed elements in c, and store the results in dst.
6286///
6287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ph)
6288#[inline]
6289#[target_feature(enable = "avx512fp16")]
6290#[cfg_attr(test, assert_instr(vfnmadd))]
6291#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6292pub fn _mm512_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6293    unsafe { simd_fma(simd_neg(a), b, c) }
6294}
6295
6296/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6297/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6298/// from a when the corresponding mask bit is not set).
6299///
6300/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ph)
6301#[inline]
6302#[target_feature(enable = "avx512fp16")]
6303#[cfg_attr(test, assert_instr(vfnmadd))]
6304#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6305pub fn _mm512_mask_fnmadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6306    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), a) }
6307}
6308
6309/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6310/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6311/// from c when the corresponding mask bit is not set).
6312///
6313/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ph)
6314#[inline]
6315#[target_feature(enable = "avx512fp16")]
6316#[cfg_attr(test, assert_instr(vfnmadd))]
6317#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6318pub fn _mm512_mask3_fnmadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6319    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), c) }
6320}
6321
6322/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6323/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6324/// out when the corresponding mask bit is not set).
6325///
6326/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ph)
6327#[inline]
6328#[target_feature(enable = "avx512fp16")]
6329#[cfg_attr(test, assert_instr(vfnmadd))]
6330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6331pub fn _mm512_maskz_fnmadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6332    unsafe { simd_select_bitmask(k, _mm512_fnmadd_ph(a, b, c), _mm512_setzero_ph()) }
6333}
6334
6335/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6336/// result from packed elements in c, and store the results in dst.
6337///
6338/// Rounding is done according to the rounding parameter, which can be one of:
6339///
6340/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6341/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6342/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6343/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6344/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6345///
6346/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ph)
6347#[inline]
6348#[target_feature(enable = "avx512fp16")]
6349#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6350#[rustc_legacy_const_generics(3)]
6351#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6352pub fn _mm512_fnmadd_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6353    unsafe {
6354        static_assert_rounding!(ROUNDING);
6355        vfmaddph_512(simd_neg(a), b, c, ROUNDING)
6356    }
6357}
6358
6359/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6360/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6361/// from a when the corresponding mask bit is not set).
6362///
6363/// Rounding is done according to the rounding parameter, which can be one of:
6364///
6365/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6366/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6367/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6368/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6369/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6370///
6371/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ph)
6372#[inline]
6373#[target_feature(enable = "avx512fp16")]
6374#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6375#[rustc_legacy_const_generics(4)]
6376#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6377pub fn _mm512_mask_fnmadd_round_ph<const ROUNDING: i32>(
6378    a: __m512h,
6379    k: __mmask32,
6380    b: __m512h,
6381    c: __m512h,
6382) -> __m512h {
6383    unsafe {
6384        static_assert_rounding!(ROUNDING);
6385        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), a)
6386    }
6387}
6388
6389/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6390/// result from packed elements in c, and store the results in dst using writemask k (the element is copied
6391/// from c when the corresponding mask bit is not set).
6392///
6393/// Rounding is done according to the rounding parameter, which can be one of:
6394///
6395/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6396/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6397/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6398/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6399/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6400///
6401/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ph)
6402#[inline]
6403#[target_feature(enable = "avx512fp16")]
6404#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6405#[rustc_legacy_const_generics(4)]
6406#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6407pub fn _mm512_mask3_fnmadd_round_ph<const ROUNDING: i32>(
6408    a: __m512h,
6409    b: __m512h,
6410    c: __m512h,
6411    k: __mmask32,
6412) -> __m512h {
6413    unsafe {
6414        static_assert_rounding!(ROUNDING);
6415        simd_select_bitmask(k, _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c), c)
6416    }
6417}
6418
6419/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract the intermediate
6420/// result from packed elements in c, and store the results in dst using zeromask k (the element is zeroed
6421/// out when the corresponding mask bit is not set).
6422///
6423/// Rounding is done according to the rounding parameter, which can be one of:
6424///
6425/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6426/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6427/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6428/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6429/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6430///
6431/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ph)
6432#[inline]
6433#[target_feature(enable = "avx512fp16")]
6434#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6435#[rustc_legacy_const_generics(4)]
6436#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6437pub fn _mm512_maskz_fnmadd_round_ph<const ROUNDING: i32>(
6438    k: __mmask32,
6439    a: __m512h,
6440    b: __m512h,
6441    c: __m512h,
6442) -> __m512h {
6443    unsafe {
6444        static_assert_rounding!(ROUNDING);
6445        simd_select_bitmask(
6446            k,
6447            _mm512_fnmadd_round_ph::<ROUNDING>(a, b, c),
6448            _mm512_setzero_ph(),
6449        )
6450    }
6451}
6452
6453/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6454/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6455/// elements from a to the upper elements of dst.
6456///
6457/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_sh)
6458#[inline]
6459#[target_feature(enable = "avx512fp16")]
6460#[cfg_attr(test, assert_instr(vfnmadd))]
6461#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6462pub fn _mm_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6463    unsafe {
6464        let extracta: f16 = simd_extract!(a, 0);
6465        let extractb: f16 = simd_extract!(b, 0);
6466        let extractc: f16 = simd_extract!(c, 0);
6467        let r = fmaf16(-extracta, extractb, extractc);
6468        simd_insert!(a, 0, r)
6469    }
6470}
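
// Editorial sketch (hypothetical helper, not part of stdarch): the lowest lane of
// `_mm_fnmadd_sh` computes c - a * b; the upper lanes are copied from `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn fnmadd_sh_semantics_sketch() {
    let a = _mm_set_sh(2.0); // lane 0 = 2.0, upper lanes = 0.0
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(10.0);
    let r = _mm_fnmadd_sh(a, b, c);
    // lane 0: 10.0 - 2.0 * 3.0 = 4.0 (exactly representable in f16)
    let lane0: f16 = unsafe { simd_extract!(r, 0) };
    assert_eq!(lane0, 4.0);
}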
6471
6472/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6473/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6474/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6475/// elements of dst.
6476///
6477/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_sh)
6478#[inline]
6479#[target_feature(enable = "avx512fp16")]
6480#[cfg_attr(test, assert_instr(vfnmadd))]
6481#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6482pub fn _mm_mask_fnmadd_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6483    unsafe {
6484        let mut fnmadd: f16 = simd_extract!(a, 0);
6485        if k & 1 != 0 {
6486            let extractb: f16 = simd_extract!(b, 0);
6487            let extractc: f16 = simd_extract!(c, 0);
6488            fnmadd = fmaf16(-fnmadd, extractb, extractc);
6489        }
6490        simd_insert!(a, 0, fnmadd)
6491    }
6492}
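
// Editorial sketch with a made-up helper name (not part of stdarch): illustrates how mask
// bit 0 selects between the computed value and the lane-0 value of `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn mask_fnmadd_sh_sketch() {
    let a = _mm_set_sh(2.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(10.0);
    let kept: f16 = unsafe { simd_extract!(_mm_mask_fnmadd_sh(a, 0b0, b, c), 0) };
    let computed: f16 = unsafe { simd_extract!(_mm_mask_fnmadd_sh(a, 0b1, b, c), 0) };
    assert_eq!(kept, 2.0); // mask bit 0 clear: lane 0 copied from a
    assert_eq!(computed, 4.0); // mask bit 0 set: 10.0 - 2.0 * 3.0
}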
6493
6494/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6495/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6496/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6497/// elements of dst.
6498///
6499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_sh)
6500#[inline]
6501#[target_feature(enable = "avx512fp16")]
6502#[cfg_attr(test, assert_instr(vfnmadd))]
6503#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6504pub fn _mm_mask3_fnmadd_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6505    unsafe {
6506        let mut fnmadd: f16 = simd_extract!(c, 0);
6507        if k & 1 != 0 {
6508            let extracta: f16 = simd_extract!(a, 0);
6509            let extractb: f16 = simd_extract!(b, 0);
6510            fnmadd = fmaf16(-extracta, extractb, fnmadd);
6511        }
6512        simd_insert!(c, 0, fnmadd)
6513    }
6514}
6515
6516/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6517/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6518/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6519/// elements of dst.
6520///
6521/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_sh)
6522#[inline]
6523#[target_feature(enable = "avx512fp16")]
6524#[cfg_attr(test, assert_instr(vfnmadd))]
6525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6526pub fn _mm_maskz_fnmadd_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6527    unsafe {
6528        let mut fnmadd: f16 = 0.0;
6529        if k & 1 != 0 {
6530            let extracta: f16 = simd_extract!(a, 0);
6531            let extractb: f16 = simd_extract!(b, 0);
6532            let extractc: f16 = simd_extract!(c, 0);
6533            fnmadd = fmaf16(-extracta, extractb, extractc);
6534        }
6535        simd_insert!(a, 0, fnmadd)
6536    }
6537}
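
// Editorial sketch (illustrative only, helper name is hypothetical): contrasts the zeromask
// form with the writemask form above; a clear mask bit zeroes lane 0 instead of keeping `a`.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn maskz_fnmadd_sh_sketch() {
    let a = _mm_set_sh(2.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(10.0);
    let zeroed: f16 = unsafe { simd_extract!(_mm_maskz_fnmadd_sh(0b0, a, b, c), 0) };
    assert_eq!(zeroed, 0.0); // mask bit 0 clear: lane 0 is zeroed out
}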
6538
6539/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6540/// result from the lower element in c. Store the result in the lower element of dst, and copy the upper 7 packed
6541/// elements from a to the upper elements of dst.
6542///
6543/// Rounding is done according to the rounding parameter, which can be one of:
6544///
6545/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6546/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6547/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6548/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6549/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6550///
6551/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmadd_round_sh)
6552#[inline]
6553#[target_feature(enable = "avx512fp16")]
6554#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6555#[rustc_legacy_const_generics(3)]
6556#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6557pub fn _mm_fnmadd_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6558    unsafe {
6559        static_assert_rounding!(ROUNDING);
6560        let extracta: f16 = simd_extract!(a, 0);
6561        let extractb: f16 = simd_extract!(b, 0);
6562        let extractc: f16 = simd_extract!(c, 0);
6563        let r = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6564        simd_insert!(a, 0, r)
6565    }
6566}
6567
6568/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6569/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6570/// is copied from a when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6571/// elements of dst.
6572///
6573/// Rounding is done according to the rounding parameter, which can be one of:
6574///
6575/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6576/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6577/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6578/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6579/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6580///
6581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_round_sh)
6582#[inline]
6583#[target_feature(enable = "avx512fp16")]
6584#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6585#[rustc_legacy_const_generics(4)]
6586#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6587pub fn _mm_mask_fnmadd_round_sh<const ROUNDING: i32>(
6588    a: __m128h,
6589    k: __mmask8,
6590    b: __m128h,
6591    c: __m128h,
6592) -> __m128h {
6593    unsafe {
6594        static_assert_rounding!(ROUNDING);
6595        let mut fnmadd: f16 = simd_extract!(a, 0);
6596        if k & 1 != 0 {
6597            let extractb: f16 = simd_extract!(b, 0);
6598            let extractc: f16 = simd_extract!(c, 0);
6599            fnmadd = vfmaddsh(-fnmadd, extractb, extractc, ROUNDING);
6600        }
6601        simd_insert!(a, 0, fnmadd)
6602    }
6603}
6604
6605/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6606/// result from the lower element in c. Store the result in the lower element of dst using writemask k (the element
6607/// is copied from c when the mask bit 0 is not set), and copy the upper 7 packed elements from c to the upper
6608/// elements of dst.
6609///
6610/// Rounding is done according to the rounding parameter, which can be one of:
6611///
6612/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6613/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6614/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6615/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6616/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6617///
6618/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_round_sh)
6619#[inline]
6620#[target_feature(enable = "avx512fp16")]
6621#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6622#[rustc_legacy_const_generics(4)]
6623#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6624pub fn _mm_mask3_fnmadd_round_sh<const ROUNDING: i32>(
6625    a: __m128h,
6626    b: __m128h,
6627    c: __m128h,
6628    k: __mmask8,
6629) -> __m128h {
6630    unsafe {
6631        static_assert_rounding!(ROUNDING);
6632        let mut fnmadd: f16 = simd_extract!(c, 0);
6633        if k & 1 != 0 {
6634            let extracta: f16 = simd_extract!(a, 0);
6635            let extractb: f16 = simd_extract!(b, 0);
6636            fnmadd = vfmaddsh(-extracta, extractb, fnmadd, ROUNDING);
6637        }
6638        simd_insert!(c, 0, fnmadd)
6639    }
6640}
6641
6642/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the intermediate
6643/// result from the lower element in c. Store the result in the lower element of dst using zeromask k (the element
6644/// is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
6645/// elements of dst.
6646///
6647/// Rounding is done according to the rounding parameter, which can be one of:
6648///
6649/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6650/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6651/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6652/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6653/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6654///
6655/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_round_sh)
6656#[inline]
6657#[target_feature(enable = "avx512fp16")]
6658#[cfg_attr(test, assert_instr(vfnmadd, ROUNDING = 8))]
6659#[rustc_legacy_const_generics(4)]
6660#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6661pub fn _mm_maskz_fnmadd_round_sh<const ROUNDING: i32>(
6662    k: __mmask8,
6663    a: __m128h,
6664    b: __m128h,
6665    c: __m128h,
6666) -> __m128h {
6667    unsafe {
6668        static_assert_rounding!(ROUNDING);
6669        let mut fnmadd: f16 = 0.0;
6670        if k & 1 != 0 {
6671            let extracta: f16 = simd_extract!(a, 0);
6672            let extractb: f16 = simd_extract!(b, 0);
6673            let extractc: f16 = simd_extract!(c, 0);
6674            fnmadd = vfmaddsh(-extracta, extractb, extractc, ROUNDING);
6675        }
6676        simd_insert!(a, 0, fnmadd)
6677    }
6678}
6679
6680/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6681/// in c from the negated intermediate result, and store the results in dst.
6682///
6683/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_ph)
6684#[inline]
6685#[target_feature(enable = "avx512fp16,avx512vl")]
6686#[cfg_attr(test, assert_instr(vfnmsub))]
6687#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6688pub fn _mm_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6689    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6690}
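
// Editorial sketch (hypothetical helper, not part of stdarch): every lane of
// `_mm_fnmsub_ph` computes -(a * b) - c.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn fnmsub_ph_semantics_sketch() {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    let r = _mm_fnmsub_ph(a, b, c);
    // each lane: -(2.0 * 3.0) - 1.0 = -7.0
    let lane0: f16 = unsafe { simd_extract!(r, 0) };
    assert_eq!(lane0, -7.0);
}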
6691
6692/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6693/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6694/// copied from a when the corresponding mask bit is not set).
6695///
6696/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ph)
6697#[inline]
6698#[target_feature(enable = "avx512fp16,avx512vl")]
6699#[cfg_attr(test, assert_instr(vfnmsub))]
6700#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6701pub fn _mm_mask_fnmsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6702    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), a) }
6703}
6704
6705/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6706/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6707/// copied from c when the corresponding mask bit is not set).
6708///
6709/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ph)
6710#[inline]
6711#[target_feature(enable = "avx512fp16,avx512vl")]
6712#[cfg_attr(test, assert_instr(vfnmsub))]
6713#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6714pub fn _mm_mask3_fnmsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
6715    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), c) }
6716}
6717
6718/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6719/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6720/// zeroed out when the corresponding mask bit is not set).
6721///
6722/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ph)
6723#[inline]
6724#[target_feature(enable = "avx512fp16,avx512vl")]
6725#[cfg_attr(test, assert_instr(vfnmsub))]
6726#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6727pub fn _mm_maskz_fnmsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6728    unsafe { simd_select_bitmask(k, _mm_fnmsub_ph(a, b, c), _mm_setzero_ph()) }
6729}
6730
6731/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6732/// in c from the negated intermediate result, and store the results in dst.
6733///
6734/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fnmsub_ph)
6735#[inline]
6736#[target_feature(enable = "avx512fp16,avx512vl")]
6737#[cfg_attr(test, assert_instr(vfnmsub))]
6738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6739pub fn _mm256_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6740    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6741}
6742
6743/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6744/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6745/// copied from a when the corresponding mask bit is not set).
6746///
6747/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ph)
6748#[inline]
6749#[target_feature(enable = "avx512fp16,avx512vl")]
6750#[cfg_attr(test, assert_instr(vfnmsub))]
6751#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6752pub fn _mm256_mask_fnmsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
6753    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), a) }
6754}
6755
6756/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6757/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6758/// copied from c when the corresponding mask bit is not set).
6759///
6760/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ph)
6761#[inline]
6762#[target_feature(enable = "avx512fp16,avx512vl")]
6763#[cfg_attr(test, assert_instr(vfnmsub))]
6764#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6765pub fn _mm256_mask3_fnmsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
6766    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), c) }
6767}
6768
6769/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6770/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6771/// zeroed out when the corresponding mask bit is not set).
6772///
6773/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ph)
6774#[inline]
6775#[target_feature(enable = "avx512fp16,avx512vl")]
6776#[cfg_attr(test, assert_instr(vfnmsub))]
6777#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6778pub fn _mm256_maskz_fnmsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
6779    unsafe { simd_select_bitmask(k, _mm256_fnmsub_ph(a, b, c), _mm256_setzero_ph()) }
6780}
6781
6782/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6783/// in c from the negated intermediate result, and store the results in dst.
6784///
6785/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ph)
6786#[inline]
6787#[target_feature(enable = "avx512fp16")]
6788#[cfg_attr(test, assert_instr(vfnmsub))]
6789#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6790pub fn _mm512_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6791    unsafe { simd_fma(simd_neg(a), b, simd_neg(c)) }
6792}
6793
6794/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6795/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6796/// copied from a when the corresponding mask bit is not set).
6797///
6798/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ph)
6799#[inline]
6800#[target_feature(enable = "avx512fp16")]
6801#[cfg_attr(test, assert_instr(vfnmsub))]
6802#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6803pub fn _mm512_mask_fnmsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
6804    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), a) }
6805}
6806
6807/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6808/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6809/// copied from c when the corresponding mask bit is not set).
6810///
6811/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ph)
6812#[inline]
6813#[target_feature(enable = "avx512fp16")]
6814#[cfg_attr(test, assert_instr(vfnmsub))]
6815#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6816pub fn _mm512_mask3_fnmsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
6817    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), c) }
6818}
6819
6820/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6821/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6822/// zeroed out when the corresponding mask bit is not set).
6823///
6824/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ph)
6825#[inline]
6826#[target_feature(enable = "avx512fp16")]
6827#[cfg_attr(test, assert_instr(vfnmsub))]
6828#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6829pub fn _mm512_maskz_fnmsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6830    unsafe { simd_select_bitmask(k, _mm512_fnmsub_ph(a, b, c), _mm512_setzero_ph()) }
6831}
6832
6833/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6834/// in c from the negated intermediate result, and store the results in dst.
6835///
6836/// Rounding is done according to the rounding parameter, which can be one of:
6837///
6838/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6839/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6840/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6841/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6842/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6843///
6844/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ph)
6845#[inline]
6846#[target_feature(enable = "avx512fp16")]
6847#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6848#[rustc_legacy_const_generics(3)]
6849#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6850pub fn _mm512_fnmsub_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
6851    unsafe {
6852        static_assert_rounding!(ROUNDING);
6853        vfmaddph_512(simd_neg(a), b, simd_neg(c), ROUNDING)
6854    }
6855}
6856
6857/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6858/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6859/// copied from a when the corresponding mask bit is not set).
6860///
6861/// Rounding is done according to the rounding parameter, which can be one of:
6862///
6863/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6864/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6865/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6866/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6867/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6868///
6869/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ph)
6870#[inline]
6871#[target_feature(enable = "avx512fp16")]
6872#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6873#[rustc_legacy_const_generics(4)]
6874#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6875pub fn _mm512_mask_fnmsub_round_ph<const ROUNDING: i32>(
6876    a: __m512h,
6877    k: __mmask32,
6878    b: __m512h,
6879    c: __m512h,
6880) -> __m512h {
6881    unsafe {
6882        static_assert_rounding!(ROUNDING);
6883        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), a)
6884    }
6885}
6886
6887/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6888/// in c from the negated intermediate result, and store the results in dst using writemask k (the element is
6889/// copied from c when the corresponding mask bit is not set).
6890///
6891/// Rounding is done according to the rounding parameter, which can be one of:
6892///
6893/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6894/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6895/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6896/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6897/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6898///
6899/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ph)
6900#[inline]
6901#[target_feature(enable = "avx512fp16")]
6902#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6903#[rustc_legacy_const_generics(4)]
6904#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6905pub fn _mm512_mask3_fnmsub_round_ph<const ROUNDING: i32>(
6906    a: __m512h,
6907    b: __m512h,
6908    c: __m512h,
6909    k: __mmask32,
6910) -> __m512h {
6911    unsafe {
6912        static_assert_rounding!(ROUNDING);
6913        simd_select_bitmask(k, _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c), c)
6914    }
6915}
6916
6917/// Multiply packed half-precision (16-bit) floating-point elements in a and b, subtract packed elements
6918/// in c from the negated intermediate result, and store the results in dst using zeromask k (the element is
6919/// zeroed out when the corresponding mask bit is not set).
6920///
6921/// Rounding is done according to the rounding parameter, which can be one of:
6922///
6923/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
6924/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
6925/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
6926/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
6927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
6928///
6929/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ph)
6930#[inline]
6931#[target_feature(enable = "avx512fp16")]
6932#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
6933#[rustc_legacy_const_generics(4)]
6934#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6935pub fn _mm512_maskz_fnmsub_round_ph<const ROUNDING: i32>(
6936    k: __mmask32,
6937    a: __m512h,
6938    b: __m512h,
6939    c: __m512h,
6940) -> __m512h {
6941    unsafe {
6942        static_assert_rounding!(ROUNDING);
6943        simd_select_bitmask(
6944            k,
6945            _mm512_fnmsub_round_ph::<ROUNDING>(a, b, c),
6946            _mm512_setzero_ph(),
6947        )
6948    }
6949}
6950
6951/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6952/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
6953/// the upper 7 packed elements from a to the upper elements of dst.
6954///
6955/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_sh)
6956#[inline]
6957#[target_feature(enable = "avx512fp16")]
6958#[cfg_attr(test, assert_instr(vfnmsub))]
6959#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6960pub fn _mm_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
6961    unsafe {
6962        let extracta: f16 = simd_extract!(a, 0);
6963        let extractb: f16 = simd_extract!(b, 0);
6964        let extractc: f16 = simd_extract!(c, 0);
6965        let r = fmaf16(-extracta, extractb, -extractc);
6966        simd_insert!(a, 0, r)
6967    }
6968}
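
// Editorial sketch (made-up helper, not part of stdarch): the lowest lane of `_mm_fnmsub_sh`
// computes -(a * b) - c, i.e. c is subtracted from the negated product.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16")]
fn fnmsub_sh_semantics_sketch() {
    let a = _mm_set_sh(2.0);
    let b = _mm_set_sh(3.0);
    let c = _mm_set_sh(1.0);
    let lane0: f16 = unsafe { simd_extract!(_mm_fnmsub_sh(a, b, c), 0) };
    assert_eq!(lane0, -7.0); // -(2.0 * 3.0) - 1.0
}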
6969
6970/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6971/// element in c from the negated intermediate result. Store the result in the lower element of dst using
6972/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
6973/// elements from a to the upper elements of dst.
6974///
6975/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_sh)
6976#[inline]
6977#[target_feature(enable = "avx512fp16")]
6978#[cfg_attr(test, assert_instr(vfnmsub))]
6979#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
6980pub fn _mm_mask_fnmsub_sh(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
6981    unsafe {
6982        let mut fnmsub: f16 = simd_extract!(a, 0);
6983        if k & 1 != 0 {
6984            let extractb: f16 = simd_extract!(b, 0);
6985            let extractc: f16 = simd_extract!(c, 0);
6986            fnmsub = fmaf16(-fnmsub, extractb, -extractc);
6987        }
6988        simd_insert!(a, 0, fnmsub)
6989    }
6990}
6991
6992/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
6993/// element in c from the negated intermediate result. Store the result in the lower element of dst using
6994/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
6995/// elements from c to the upper elements of dst.
6996///
6997/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_sh)
6998#[inline]
6999#[target_feature(enable = "avx512fp16")]
7000#[cfg_attr(test, assert_instr(vfnmsub))]
7001#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7002pub fn _mm_mask3_fnmsub_sh(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7003    unsafe {
7004        let mut fnmsub: f16 = simd_extract!(c, 0);
7005        if k & 1 != 0 {
7006            let extracta: f16 = simd_extract!(a, 0);
7007            let extractb: f16 = simd_extract!(b, 0);
7008            fnmsub = fmaf16(-extracta, extractb, -fnmsub);
7009        }
7010        simd_insert!(c, 0, fnmsub)
7011    }
7012}
7013
7014/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
7015/// element in c from the negated intermediate result. Store the result in the lower element of dst using
7016/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
7017/// elements from a to the upper elements of dst.
7018///
7019/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_sh)
7020#[inline]
7021#[target_feature(enable = "avx512fp16")]
7022#[cfg_attr(test, assert_instr(vfnmsub))]
7023#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7024pub fn _mm_maskz_fnmsub_sh(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7025    unsafe {
7026        let mut fnmsub: f16 = 0.0;
7027        if k & 1 != 0 {
7028            let extracta: f16 = simd_extract!(a, 0);
7029            let extractb: f16 = simd_extract!(b, 0);
7030            let extractc: f16 = simd_extract!(c, 0);
7031            fnmsub = fmaf16(-extracta, extractb, -extractc);
7032        }
7033        simd_insert!(a, 0, fnmsub)
7034    }
7035}
7036
7037/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
7038/// element in c from the negated intermediate result. Store the result in the lower element of dst, and copy
7039/// the upper 7 packed elements from a to the upper elements of dst.
7040///
7041/// Rounding is done according to the rounding parameter, which can be one of:
7042///
7043/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7044/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7045/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7046/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7047/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7048///
7049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fnmsub_round_sh)
7050#[inline]
7051#[target_feature(enable = "avx512fp16")]
7052#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7053#[rustc_legacy_const_generics(3)]
7054#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7055pub fn _mm_fnmsub_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7056    unsafe {
7057        static_assert_rounding!(ROUNDING);
7058        let extracta: f16 = simd_extract!(a, 0);
7059        let extractb: f16 = simd_extract!(b, 0);
7060        let extractc: f16 = simd_extract!(c, 0);
7061        let r = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7062        simd_insert!(a, 0, r)
7063    }
7064}
7065
7066/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
7067/// element in c from the negated intermediate result. Store the result in the lower element of dst using
7068/// writemask k (the element is copied from a when the mask bit 0 is not set), and copy the upper 7 packed
7069/// elements from a to the upper elements of dst.
7070///
7071/// Rounding is done according to the rounding parameter, which can be one of:
7072///
7073/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7074/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7075/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7076/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7077/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7078///
7079/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_round_sh)
7080#[inline]
7081#[target_feature(enable = "avx512fp16")]
7082#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7083#[rustc_legacy_const_generics(4)]
7084#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7085pub fn _mm_mask_fnmsub_round_sh<const ROUNDING: i32>(
7086    a: __m128h,
7087    k: __mmask8,
7088    b: __m128h,
7089    c: __m128h,
7090) -> __m128h {
7091    unsafe {
7092        static_assert_rounding!(ROUNDING);
7093        let mut fnmsub: f16 = simd_extract!(a, 0);
7094        if k & 1 != 0 {
7095            let extractb: f16 = simd_extract!(b, 0);
7096            let extractc: f16 = simd_extract!(c, 0);
7097            fnmsub = vfmaddsh(-fnmsub, extractb, -extractc, ROUNDING);
7098        }
7099        simd_insert!(a, 0, fnmsub)
7100    }
7101}
7102
7103/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
7104/// element in c from the negated intermediate result. Store the result in the lower element of dst using
7105/// writemask k (the element is copied from c when the mask bit 0 is not set), and copy the upper 7 packed
7106/// elements from c to the upper elements of dst.
7107///
7108/// Rounding is done according to the rounding parameter, which can be one of:
7109///
7110/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7111/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7112/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7113/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7114/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7115///
7116/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_round_sh)
7117#[inline]
7118#[target_feature(enable = "avx512fp16")]
7119#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7120#[rustc_legacy_const_generics(4)]
7121#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7122pub fn _mm_mask3_fnmsub_round_sh<const ROUNDING: i32>(
7123    a: __m128h,
7124    b: __m128h,
7125    c: __m128h,
7126    k: __mmask8,
7127) -> __m128h {
7128    unsafe {
7129        static_assert_rounding!(ROUNDING);
7130        let mut fnmsub: f16 = simd_extract!(c, 0);
7131        if k & 1 != 0 {
7132            let extracta: f16 = simd_extract!(a, 0);
7133            let extractb: f16 = simd_extract!(b, 0);
7134            fnmsub = vfmaddsh(-extracta, extractb, -fnmsub, ROUNDING);
7135        }
7136        simd_insert!(c, 0, fnmsub)
7137    }
7138}
7139
7140/// Multiply the lower half-precision (16-bit) floating-point elements in a and b, and subtract the lower
7141/// element in c from the negated intermediate result. Store the result in the lower element of dst using
7142/// zeromask k (the element is zeroed out when the mask bit 0 is not set), and copy the upper 7 packed
7143/// elements from a to the upper elements of dst.
7144///
7145/// Rounding is done according to the rounding parameter, which can be one of:
7146///
7147/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7148/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7149/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7150/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7151/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7152///
7153/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_round_sh)
7154#[inline]
7155#[target_feature(enable = "avx512fp16")]
7156#[cfg_attr(test, assert_instr(vfnmsub, ROUNDING = 8))]
7157#[rustc_legacy_const_generics(4)]
7158#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7159pub fn _mm_maskz_fnmsub_round_sh<const ROUNDING: i32>(
7160    k: __mmask8,
7161    a: __m128h,
7162    b: __m128h,
7163    c: __m128h,
7164) -> __m128h {
7165    unsafe {
7166        static_assert_rounding!(ROUNDING);
7167        let mut fnmsub: f16 = 0.0;
7168        if k & 1 != 0 {
7169            let extracta: f16 = simd_extract!(a, 0);
7170            let extractb: f16 = simd_extract!(b, 0);
7171            let extractc: f16 = simd_extract!(c, 0);
7172            fnmsub = vfmaddsh(-extracta, extractb, -extractc, ROUNDING);
7173        }
7174        simd_insert!(a, 0, fnmsub)
7175    }
7176}
7177
7178/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7179/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7180///
7181/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmaddsub_ph)
7182#[inline]
7183#[target_feature(enable = "avx512fp16,avx512vl")]
7184#[cfg_attr(test, assert_instr(vfmaddsub))]
7185#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7186pub fn _mm_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7187    unsafe {
7188        let add = simd_fma(a, b, c);
7189        let sub = simd_fma(a, b, simd_neg(c));
7190        simd_shuffle!(sub, add, [0, 9, 2, 11, 4, 13, 6, 15])
7191    }
7192}
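
// Editorial sketch (hypothetical helper, not part of stdarch): `_mm_fmaddsub_ph` subtracts
// c in the even-indexed lanes and adds c in the odd-indexed lanes, which is what the
// shuffle of the `sub` and `add` vectors above encodes.
#[cfg(test)]
#[allow(dead_code)]
#[target_feature(enable = "avx512fp16,avx512vl")]
fn fmaddsub_ph_semantics_sketch() {
    let a = _mm_set1_ph(2.0);
    let b = _mm_set1_ph(3.0);
    let c = _mm_set1_ph(1.0);
    let r = _mm_fmaddsub_ph(a, b, c);
    let lane0: f16 = unsafe { simd_extract!(r, 0) };
    let lane1: f16 = unsafe { simd_extract!(r, 1) };
    assert_eq!(lane0, 5.0); // even lane: 2.0 * 3.0 - 1.0
    assert_eq!(lane1, 7.0); // odd lane:  2.0 * 3.0 + 1.0
}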
7193
7194/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7195/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7196/// (the element is copied from a when the corresponding mask bit is not set).
7197///
7198/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ph)
7199#[inline]
7200#[target_feature(enable = "avx512fp16,avx512vl")]
7201#[cfg_attr(test, assert_instr(vfmaddsub))]
7202#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7203pub fn _mm_mask_fmaddsub_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7204    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), a) }
7205}
7206
7207/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7208/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7209/// (the element is copied from c when the corresponding mask bit is not set).
7210///
7211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ph)
7212#[inline]
7213#[target_feature(enable = "avx512fp16,avx512vl")]
7214#[cfg_attr(test, assert_instr(vfmaddsub))]
7215#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7216pub fn _mm_mask3_fmaddsub_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7217    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), c) }
7218}
7219
7220/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7221/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7222/// (the element is zeroed out when the corresponding mask bit is not set).
7223///
7224/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ph)
7225#[inline]
7226#[target_feature(enable = "avx512fp16,avx512vl")]
7227#[cfg_attr(test, assert_instr(vfmaddsub))]
7228#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7229pub fn _mm_maskz_fmaddsub_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7230    unsafe { simd_select_bitmask(k, _mm_fmaddsub_ph(a, b, c), _mm_setzero_ph()) }
7231}
7232
7233/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7234/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7235///
7236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmaddsub_ph)
7237#[inline]
7238#[target_feature(enable = "avx512fp16,avx512vl")]
7239#[cfg_attr(test, assert_instr(vfmaddsub))]
7240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7241pub fn _mm256_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7242    unsafe {
7243        let add = simd_fma(a, b, c);
7244        let sub = simd_fma(a, b, simd_neg(c));
7245        simd_shuffle!(
7246            sub,
7247            add,
7248            [0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31]
7249        )
7250    }
7251}
7252
7253/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7254/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7255/// (the element is copied from a when the corresponding mask bit is not set).
7256///
7257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ph)
7258#[inline]
7259#[target_feature(enable = "avx512fp16,avx512vl")]
7260#[cfg_attr(test, assert_instr(vfmaddsub))]
7261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7262pub fn _mm256_mask_fmaddsub_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7263    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), a) }
7264}
7265
7266/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7267/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7268/// (the element is copied from c when the corresponding mask bit is not set).
7269///
7270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ph)
7271#[inline]
7272#[target_feature(enable = "avx512fp16,avx512vl")]
7273#[cfg_attr(test, assert_instr(vfmaddsub))]
7274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7275pub fn _mm256_mask3_fmaddsub_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7276    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), c) }
7277}
7278
7279/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7280/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7281/// (the element is zeroed out when the corresponding mask bit is not set).
7282///
7283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ph)
7284#[inline]
7285#[target_feature(enable = "avx512fp16,avx512vl")]
7286#[cfg_attr(test, assert_instr(vfmaddsub))]
7287#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7288pub fn _mm256_maskz_fmaddsub_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7289    unsafe { simd_select_bitmask(k, _mm256_fmaddsub_ph(a, b, c), _mm256_setzero_ph()) }
7290}
7291
7292/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7293/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7294///
7295/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ph)
7296#[inline]
7297#[target_feature(enable = "avx512fp16")]
7298#[cfg_attr(test, assert_instr(vfmaddsub))]
7299#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7300pub fn _mm512_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7301    unsafe {
7302        let add = simd_fma(a, b, c);
7303        let sub = simd_fma(a, b, simd_neg(c));
7304        simd_shuffle!(
7305            sub,
7306            add,
7307            [
7308                0, 33, 2, 35, 4, 37, 6, 39, 8, 41, 10, 43, 12, 45, 14, 47, 16, 49, 18, 51, 20, 53,
7309                22, 55, 24, 57, 26, 59, 28, 61, 30, 63
7310            ]
7311        )
7312    }
7313}
7314
7315/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7316/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7317/// (the element is copied from a when the corresponding mask bit is not set).
7318///
7319/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ph)
7320#[inline]
7321#[target_feature(enable = "avx512fp16")]
7322#[cfg_attr(test, assert_instr(vfmaddsub))]
7323#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7324pub fn _mm512_mask_fmaddsub_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7325    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), a) }
7326}
7327
7328/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7329/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7330/// (the element is copied from c when the corresponding mask bit is not set).
7331///
7332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ph)
7333#[inline]
7334#[target_feature(enable = "avx512fp16")]
7335#[cfg_attr(test, assert_instr(vfmaddsub))]
7336#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7337pub fn _mm512_mask3_fmaddsub_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7338    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), c) }
7339}
7340
7341/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7342/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7343/// (the element is zeroed out when the corresponding mask bit is not set).
7344///
7345/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ph)
7346#[inline]
7347#[target_feature(enable = "avx512fp16")]
7348#[cfg_attr(test, assert_instr(vfmaddsub))]
7349#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7350pub fn _mm512_maskz_fmaddsub_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7351    unsafe { simd_select_bitmask(k, _mm512_fmaddsub_ph(a, b, c), _mm512_setzero_ph()) }
7352}
7353
7354/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7355/// subtract packed elements in c to/from the intermediate result, and store the results in dst.
7356///
7357/// Rounding is done according to the rounding parameter, which can be one of:
7358///
7359/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7360/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7361/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7362/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7363/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7364///
7365/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ph)
7366#[inline]
7367#[target_feature(enable = "avx512fp16")]
7368#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7369#[rustc_legacy_const_generics(3)]
7370#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7371pub fn _mm512_fmaddsub_round_ph<const ROUNDING: i32>(
7372    a: __m512h,
7373    b: __m512h,
7374    c: __m512h,
7375) -> __m512h {
7376    unsafe {
7377        static_assert_rounding!(ROUNDING);
7378        vfmaddsubph_512(a, b, c, ROUNDING)
7379    }
7380}
7381
7382/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7383/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7384/// (the element is copied from a when the corresponding mask bit is not set).
7385///
7386/// Rounding is done according to the rounding parameter, which can be one of:
7387///
7388/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7389/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7390/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7391/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7392/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7393///
7394/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ph)
7395#[inline]
7396#[target_feature(enable = "avx512fp16")]
7397#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7398#[rustc_legacy_const_generics(4)]
7399#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7400pub fn _mm512_mask_fmaddsub_round_ph<const ROUNDING: i32>(
7401    a: __m512h,
7402    k: __mmask32,
7403    b: __m512h,
7404    c: __m512h,
7405) -> __m512h {
7406    unsafe {
7407        static_assert_rounding!(ROUNDING);
7408        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), a)
7409    }
7410}
7411
7412/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7413/// subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7414/// (the element is copied from c when the corresponding mask bit is not set).
7415///
7416/// Rounding is done according to the rounding parameter, which can be one of:
7417///
7418/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7419/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7420/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7421/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7422/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7423///
7424/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ph)
7425#[inline]
7426#[target_feature(enable = "avx512fp16")]
7427#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7428#[rustc_legacy_const_generics(4)]
7429#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7430pub fn _mm512_mask3_fmaddsub_round_ph<const ROUNDING: i32>(
7431    a: __m512h,
7432    b: __m512h,
7433    c: __m512h,
7434    k: __mmask32,
7435) -> __m512h {
7436    unsafe {
7437        static_assert_rounding!(ROUNDING);
7438        simd_select_bitmask(k, _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c), c)
7439    }
7440}
7441
7442/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively add and
7443/// subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7444/// (the element is zeroed out when the corresponding mask bit is not set).
7445///
7446/// Rounding is done according to the rounding parameter, which can be one of:
7447///
7448/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7449/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7450/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7451/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7452/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7453///
7454/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ph)
7455#[inline]
7456#[target_feature(enable = "avx512fp16")]
7457#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))]
7458#[rustc_legacy_const_generics(4)]
7459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7460pub fn _mm512_maskz_fmaddsub_round_ph<const ROUNDING: i32>(
7461    k: __mmask32,
7462    a: __m512h,
7463    b: __m512h,
7464    c: __m512h,
7465) -> __m512h {
7466    unsafe {
7467        static_assert_rounding!(ROUNDING);
7468        simd_select_bitmask(
7469            k,
7470            _mm512_fmaddsub_round_ph::<ROUNDING>(a, b, c),
7471            _mm512_setzero_ph(),
7472        )
7473    }
7474}
7475
7476/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7477/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7478///
7479/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmsubadd_ph)
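///
/// A minimal usage sketch with illustrative values (not from Intel's documentation), assuming the
/// caller is itself compiled with the `avx512fp16` and `avx512vl` features; marked `ignore` so it
/// is not run as a doctest. Even-indexed lanes add c, odd-indexed lanes subtract it:
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// let r = _mm_fmsubadd_ph(a, b, c);
/// // lanes 0, 2, 4, 6 hold 2.0 * 3.0 + 1.0 = 7.0
/// // lanes 1, 3, 5, 7 hold 2.0 * 3.0 - 1.0 = 5.0
/// ```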
7480#[inline]
7481#[target_feature(enable = "avx512fp16,avx512vl")]
7482#[cfg_attr(test, assert_instr(vfmsubadd))]
7483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7484pub fn _mm_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7485    _mm_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
7486}
7487
7488/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7489/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7490/// (the element is copied from a when the corresponding mask bit is not set).
7491///
7492/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ph)
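///
/// A minimal writemask sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16` and `avx512vl`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Only lanes 0 and 1 are computed; lanes 2..=7 keep the corresponding value from a (2.0).
/// let r = _mm_mask_fmsubadd_ph(a, 0b0000_0011, b, c);
/// ```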
7493#[inline]
7494#[target_feature(enable = "avx512fp16,avx512vl")]
7495#[cfg_attr(test, assert_instr(vfmsubadd))]
7496#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7497pub fn _mm_mask_fmsubadd_ph(a: __m128h, k: __mmask8, b: __m128h, c: __m128h) -> __m128h {
7498    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), a) }
7499}
7500
7501/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7502/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7503/// (the element is copied from c when the corresponding mask bit is not set).
7504///
7505/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ph)
7506#[inline]
7507#[target_feature(enable = "avx512fp16,avx512vl")]
7508#[cfg_attr(test, assert_instr(vfmsubadd))]
7509#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7510pub fn _mm_mask3_fmsubadd_ph(a: __m128h, b: __m128h, c: __m128h, k: __mmask8) -> __m128h {
7511    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), c) }
7512}
7513
7514/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7515/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7516/// (the element is zeroed out when the corresponding mask bit is not set).
7517///
7518/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ph)
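///
/// A minimal zeromask sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16` and `avx512vl`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set1_ph(3.0);
/// let c = _mm_set1_ph(1.0);
/// // Lanes 0 and 1 are computed; lanes 2..=7 are zeroed out.
/// let r = _mm_maskz_fmsubadd_ph(0b0000_0011, a, b, c);
/// ```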
7519#[inline]
7520#[target_feature(enable = "avx512fp16,avx512vl")]
7521#[cfg_attr(test, assert_instr(vfmsubadd))]
7522#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7523pub fn _mm_maskz_fmsubadd_ph(k: __mmask8, a: __m128h, b: __m128h, c: __m128h) -> __m128h {
7524    unsafe { simd_select_bitmask(k, _mm_fmsubadd_ph(a, b, c), _mm_setzero_ph()) }
7525}
7526
7527/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7528/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7529///
7530/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fmsubadd_ph)
7531#[inline]
7532#[target_feature(enable = "avx512fp16,avx512vl")]
7533#[cfg_attr(test, assert_instr(vfmsubadd))]
7534#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7535pub fn _mm256_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7536    _mm256_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
7537}
7538
7539/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7540/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7541/// (the element is copied from a when the corresponding mask bit is not set).
7542///
7543/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ph)
7544#[inline]
7545#[target_feature(enable = "avx512fp16,avx512vl")]
7546#[cfg_attr(test, assert_instr(vfmsubadd))]
7547#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7548pub fn _mm256_mask_fmsubadd_ph(a: __m256h, k: __mmask16, b: __m256h, c: __m256h) -> __m256h {
7549    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), a) }
7550}
7551
7552/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7553/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7554/// (the element is copied from c when the corresponding mask bit is not set).
7555///
7556/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ph)
7557#[inline]
7558#[target_feature(enable = "avx512fp16,avx512vl")]
7559#[cfg_attr(test, assert_instr(vfmsubadd))]
7560#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7561pub fn _mm256_mask3_fmsubadd_ph(a: __m256h, b: __m256h, c: __m256h, k: __mmask16) -> __m256h {
7562    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), c) }
7563}
7564
7565/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7566/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7567/// (the element is zeroed out when the corresponding mask bit is not set).
7568///
7569/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ph)
7570#[inline]
7571#[target_feature(enable = "avx512fp16,avx512vl")]
7572#[cfg_attr(test, assert_instr(vfmsubadd))]
7573#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7574pub fn _mm256_maskz_fmsubadd_ph(k: __mmask16, a: __m256h, b: __m256h, c: __m256h) -> __m256h {
7575    unsafe { simd_select_bitmask(k, _mm256_fmsubadd_ph(a, b, c), _mm256_setzero_ph()) }
7576}
7577
7578/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7579/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7580///
7581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ph)
7582#[inline]
7583#[target_feature(enable = "avx512fp16")]
7584#[cfg_attr(test, assert_instr(vfmsubadd))]
7585#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7586pub fn _mm512_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7587    _mm512_fmaddsub_ph(a, b, unsafe { simd_neg(c) })
7588}
7589
7590/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7591/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7592/// (the element is copied from a when the corresponding mask bit is not set).
7593///
7594/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ph)
7595#[inline]
7596#[target_feature(enable = "avx512fp16")]
7597#[cfg_attr(test, assert_instr(vfmsubadd))]
7598#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7599pub fn _mm512_mask_fmsubadd_ph(a: __m512h, k: __mmask32, b: __m512h, c: __m512h) -> __m512h {
7600    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), a) }
7601}
7602
7603/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7604/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7605/// (the element is copied from c when the corresponding mask bit is not set).
7606///
7607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ph)
7608#[inline]
7609#[target_feature(enable = "avx512fp16")]
7610#[cfg_attr(test, assert_instr(vfmsubadd))]
7611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7612pub fn _mm512_mask3_fmsubadd_ph(a: __m512h, b: __m512h, c: __m512h, k: __mmask32) -> __m512h {
7613    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), c) }
7614}
7615
7616/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7617/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7618/// (the element is zeroed out when the corresponding mask bit is not set).
7619///
7620/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ph)
7621#[inline]
7622#[target_feature(enable = "avx512fp16")]
7623#[cfg_attr(test, assert_instr(vfmsubadd))]
7624#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7625pub fn _mm512_maskz_fmsubadd_ph(k: __mmask32, a: __m512h, b: __m512h, c: __m512h) -> __m512h {
7626    unsafe { simd_select_bitmask(k, _mm512_fmsubadd_ph(a, b, c), _mm512_setzero_ph()) }
7627}
7628
7629/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7630/// and add packed elements in c to/from the intermediate result, and store the results in dst.
7631///
7632/// Rounding is done according to the rounding parameter, which can be one of:
7633///
7634/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7635/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7636/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7637/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7638/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7639///
7640/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ph)
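///
/// A minimal sketch of supplying the rounding mode as a const generic, with illustrative values
/// (assumes the caller is compiled with `avx512fp16`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm512_set1_ph(2.0);
/// let b = _mm512_set1_ph(3.0);
/// let c = _mm512_set1_ph(1.0);
/// // Round to nearest and suppress exceptions.
/// let r = _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
/// ```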
7641#[inline]
7642#[target_feature(enable = "avx512fp16")]
7643#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7644#[rustc_legacy_const_generics(3)]
7645#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7646pub fn _mm512_fmsubadd_round_ph<const ROUNDING: i32>(
7647    a: __m512h,
7648    b: __m512h,
7649    c: __m512h,
7650) -> __m512h {
7651    unsafe {
7652        static_assert_rounding!(ROUNDING);
7653        vfmaddsubph_512(a, b, simd_neg(c), ROUNDING)
7654    }
7655}
7656
7657/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7658/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7659/// (the element is copied from a when the corresponding mask bit is not set).
7660///
7661/// Rounding is done according to the rounding parameter, which can be one of:
7662///
7663/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7664/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7665/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7666/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7667/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7668///
7669/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ph)
7670#[inline]
7671#[target_feature(enable = "avx512fp16")]
7672#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7673#[rustc_legacy_const_generics(4)]
7674#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7675pub fn _mm512_mask_fmsubadd_round_ph<const ROUNDING: i32>(
7676    a: __m512h,
7677    k: __mmask32,
7678    b: __m512h,
7679    c: __m512h,
7680) -> __m512h {
7681    unsafe {
7682        static_assert_rounding!(ROUNDING);
7683        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), a)
7684    }
7685}
7686
7687/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7688/// and add packed elements in c to/from the intermediate result, and store the results in dst using writemask k
7689/// (the element is copied from c when the corresponding mask bit is not set).
7690///
7691/// Rounding is done according to the rounding parameter, which can be one of:
7692///
7693/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7694/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7695/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7696/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7697/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7698///
7699/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ph)
7700#[inline]
7701#[target_feature(enable = "avx512fp16")]
7702#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7703#[rustc_legacy_const_generics(4)]
7704#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7705pub fn _mm512_mask3_fmsubadd_round_ph<const ROUNDING: i32>(
7706    a: __m512h,
7707    b: __m512h,
7708    c: __m512h,
7709    k: __mmask32,
7710) -> __m512h {
7711    unsafe {
7712        static_assert_rounding!(ROUNDING);
7713        simd_select_bitmask(k, _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c), c)
7714    }
7715}
7716
7717/// Multiply packed half-precision (16-bit) floating-point elements in a and b, alternatively subtract
7718/// and add packed elements in c to/from the intermediate result, and store the results in dst using zeromask k
7719/// (the element is zeroed out when the corresponding mask bit is not set).
7720///
7721/// Rounding is done according to the rounding parameter, which can be one of:
7722///
7723/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
7724/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
7725/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
7726/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
7727/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
7728///
7729/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ph)
7730#[inline]
7731#[target_feature(enable = "avx512fp16")]
7732#[cfg_attr(test, assert_instr(vfmsubadd, ROUNDING = 8))]
7733#[rustc_legacy_const_generics(4)]
7734#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7735pub fn _mm512_maskz_fmsubadd_round_ph<const ROUNDING: i32>(
7736    k: __mmask32,
7737    a: __m512h,
7738    b: __m512h,
7739    c: __m512h,
7740) -> __m512h {
7741    unsafe {
7742        static_assert_rounding!(ROUNDING);
7743        simd_select_bitmask(
7744            k,
7745            _mm512_fmsubadd_round_ph::<ROUNDING>(a, b, c),
7746            _mm512_setzero_ph(),
7747        )
7748    }
7749}
7750
7751/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst.
7752/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7753///
7754/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ph)
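///
/// A minimal usage sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16` and `avx512vl`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(4.0);
/// // Each lane approximates 1.0 / 4.0 = 0.25 to within the stated relative error.
/// let r = _mm_rcp_ph(a);
/// ```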
7755#[inline]
7756#[target_feature(enable = "avx512fp16,avx512vl")]
7757#[cfg_attr(test, assert_instr(vrcpph))]
7758#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7759pub fn _mm_rcp_ph(a: __m128h) -> __m128h {
7760    _mm_mask_rcp_ph(_mm_undefined_ph(), 0xff, a)
7761}
7762
7763/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the
7764/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7765/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7766///
7767/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_ph)
7768#[inline]
7769#[target_feature(enable = "avx512fp16,avx512vl")]
7770#[cfg_attr(test, assert_instr(vrcpph))]
7771#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7772pub fn _mm_mask_rcp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7773    unsafe { vrcpph_128(a, src, k) }
7774}
7775
7776/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the
7777/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7778/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7779///
7780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_ph)
7781#[inline]
7782#[target_feature(enable = "avx512fp16,avx512vl")]
7783#[cfg_attr(test, assert_instr(vrcpph))]
7784#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7785pub fn _mm_maskz_rcp_ph(k: __mmask8, a: __m128h) -> __m128h {
7786    _mm_mask_rcp_ph(_mm_setzero_ph(), k, a)
7787}
7788
7789/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst.
7790/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7791///
7792/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp_ph)
7793#[inline]
7794#[target_feature(enable = "avx512fp16,avx512vl")]
7795#[cfg_attr(test, assert_instr(vrcpph))]
7796#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7797pub fn _mm256_rcp_ph(a: __m256h) -> __m256h {
7798    _mm256_mask_rcp_ph(_mm256_undefined_ph(), 0xffff, a)
7799}
7800
7801/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the
7802/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7803/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7804///
7805/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp_ph)
7806#[inline]
7807#[target_feature(enable = "avx512fp16,avx512vl")]
7808#[cfg_attr(test, assert_instr(vrcpph))]
7809#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7810pub fn _mm256_mask_rcp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7811    unsafe { vrcpph_256(a, src, k) }
7812}
7813
7814/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the
7815/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7816/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7817///
7818/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp_ph)
7819#[inline]
7820#[target_feature(enable = "avx512fp16,avx512vl")]
7821#[cfg_attr(test, assert_instr(vrcpph))]
7822#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7823pub fn _mm256_maskz_rcp_ph(k: __mmask16, a: __m256h) -> __m256h {
7824    _mm256_mask_rcp_ph(_mm256_setzero_ph(), k, a)
7825}
7826
7827/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the results in dst.
7828/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7829///
7830/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp_ph)
7831#[inline]
7832#[target_feature(enable = "avx512fp16")]
7833#[cfg_attr(test, assert_instr(vrcpph))]
7834#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7835pub fn _mm512_rcp_ph(a: __m512h) -> __m512h {
7836    _mm512_mask_rcp_ph(_mm512_undefined_ph(), 0xffffffff, a)
7837}
7838
7839/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the
7840/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
7841/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7842///
7843/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp_ph)
7844#[inline]
7845#[target_feature(enable = "avx512fp16")]
7846#[cfg_attr(test, assert_instr(vrcpph))]
7847#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7848pub fn _mm512_mask_rcp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
7849    unsafe { vrcpph_512(a, src, k) }
7850}
7851
7852/// Compute the approximate reciprocal of packed half-precision (16-bit) floating-point elements in a, and store the
7853/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
7854/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7855///
7856/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp_ph)
7857#[inline]
7858#[target_feature(enable = "avx512fp16")]
7859#[cfg_attr(test, assert_instr(vrcpph))]
7860#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7861pub fn _mm512_maskz_rcp_ph(k: __mmask32, a: __m512h) -> __m512h {
7862    _mm512_mask_rcp_ph(_mm512_setzero_ph(), k, a)
7863}
7864
7865/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7866/// store the result in the lower element of dst, and copy the upper 7 packed elements from a to the
7867/// upper elements of dst.
7868/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7869///
7870/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_sh)
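///
/// A minimal usage sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(9.0);
/// let b = _mm_set1_ph(4.0);
/// // Lane 0 approximates 1.0 / 4.0 = 0.25; lanes 1..=7 are copied from a (9.0).
/// let r = _mm_rcp_sh(a, b);
/// ```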
7871#[inline]
7872#[target_feature(enable = "avx512fp16")]
7873#[cfg_attr(test, assert_instr(vrcpsh))]
7874#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7875pub fn _mm_rcp_sh(a: __m128h, b: __m128h) -> __m128h {
7876    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
7877}
7878
7879/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7880/// store the result in the lower element of dst using writemask k (the element is copied from src when
7881/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7882/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7883///
7884/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp_sh)
7885#[inline]
7886#[target_feature(enable = "avx512fp16")]
7887#[cfg_attr(test, assert_instr(vrcpsh))]
7888#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7889pub fn _mm_mask_rcp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7890    unsafe { vrcpsh(a, b, src, k) }
7891}
7892
7893/// Compute the approximate reciprocal of the lower half-precision (16-bit) floating-point element in b,
7894/// store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
7895/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
7896/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7897///
7898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp_sh)
7899#[inline]
7900#[target_feature(enable = "avx512fp16")]
7901#[cfg_attr(test, assert_instr(vrcpsh))]
7902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7903pub fn _mm_maskz_rcp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
7904    _mm_mask_rcp_sh(f16x8::ZERO.as_m128h(), k, a, b)
7905}
7906
7907/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7908/// elements in a, and store the results in dst.
7909/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7910///
7911/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ph)
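///
/// A minimal usage sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16` and `avx512vl`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(16.0);
/// // Each lane approximates 1.0 / sqrt(16.0) = 0.25.
/// let r = _mm_rsqrt_ph(a);
/// ```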
7912#[inline]
7913#[target_feature(enable = "avx512fp16,avx512vl")]
7914#[cfg_attr(test, assert_instr(vrsqrtph))]
7915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7916pub fn _mm_rsqrt_ph(a: __m128h) -> __m128h {
7917    _mm_mask_rsqrt_ph(_mm_undefined_ph(), 0xff, a)
7918}
7919
7920/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7921/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7922/// the corresponding mask bit is not set).
7923/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7924///
7925/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_ph)
7926#[inline]
7927#[target_feature(enable = "avx512fp16,avx512vl")]
7928#[cfg_attr(test, assert_instr(vrsqrtph))]
7929#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7930pub fn _mm_mask_rsqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
7931    unsafe { vrsqrtph_128(a, src, k) }
7932}
7933
7934/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7935/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7936/// corresponding mask bit is not set).
7937/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7938///
7939/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_ph)
7940#[inline]
7941#[target_feature(enable = "avx512fp16,avx512vl")]
7942#[cfg_attr(test, assert_instr(vrsqrtph))]
7943#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7944pub fn _mm_maskz_rsqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
7945    _mm_mask_rsqrt_ph(_mm_setzero_ph(), k, a)
7946}
7947
7948/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7949/// elements in a, and store the results in dst.
7950/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7951///
7952/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rsqrt_ph)
7953#[inline]
7954#[target_feature(enable = "avx512fp16,avx512vl")]
7955#[cfg_attr(test, assert_instr(vrsqrtph))]
7956#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7957pub fn _mm256_rsqrt_ph(a: __m256h) -> __m256h {
7958    _mm256_mask_rsqrt_ph(_mm256_undefined_ph(), 0xffff, a)
7959}
7960
7961/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7962/// elements in a, and store the results in dst using writemask k (elements are copied from src when
7963/// the corresponding mask bit is not set).
7964/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7965///
7966/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt_ph)
7967#[inline]
7968#[target_feature(enable = "avx512fp16,avx512vl")]
7969#[cfg_attr(test, assert_instr(vrsqrtph))]
7970#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7971pub fn _mm256_mask_rsqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
7972    unsafe { vrsqrtph_256(a, src, k) }
7973}
7974
7975/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7976/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
7977/// corresponding mask bit is not set).
7978/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7979///
7980/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt_ph)
7981#[inline]
7982#[target_feature(enable = "avx512fp16,avx512vl")]
7983#[cfg_attr(test, assert_instr(vrsqrtph))]
7984#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7985pub fn _mm256_maskz_rsqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
7986    _mm256_mask_rsqrt_ph(_mm256_setzero_ph(), k, a)
7987}
7988
7989/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
7990/// elements in a, and store the results in dst.
7991/// The maximum relative error for this approximation is less than `1.5*2^-12`.
7992///
7993/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt_ph)
7994#[inline]
7995#[target_feature(enable = "avx512fp16")]
7996#[cfg_attr(test, assert_instr(vrsqrtph))]
7997#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
7998pub fn _mm512_rsqrt_ph(a: __m512h) -> __m512h {
7999    _mm512_mask_rsqrt_ph(_mm512_undefined_ph(), 0xffffffff, a)
8000}
8001
8002/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8003/// elements in a, and store the results in dst using writemask k (elements are copied from src when
8004/// the corresponding mask bit is not set).
8005/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8006///
8007/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt_ph)
8008#[inline]
8009#[target_feature(enable = "avx512fp16")]
8010#[cfg_attr(test, assert_instr(vrsqrtph))]
8011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8012pub fn _mm512_mask_rsqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8013    unsafe { vrsqrtph_512(a, src, k) }
8014}
8015
8016/// Compute the approximate reciprocal square root of packed half-precision (16-bit) floating-point
8017/// elements in a, and store the results in dst using zeromask k (elements are zeroed out when the
8018/// corresponding mask bit is not set).
8019/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8020///
8021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt_ph)
8022#[inline]
8023#[target_feature(enable = "avx512fp16")]
8024#[cfg_attr(test, assert_instr(vrsqrtph))]
8025#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8026pub fn _mm512_maskz_rsqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8027    _mm512_mask_rsqrt_ph(_mm512_setzero_ph(), k, a)
8028}
8029
8030/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
8031/// element in b, store the result in the lower element of dst, and copy the upper 7 packed elements from a
8032/// to the upper elements of dst.
8033/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8034///
8035/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_sh)
8036#[inline]
8037#[target_feature(enable = "avx512fp16")]
8038#[cfg_attr(test, assert_instr(vrsqrtsh))]
8039#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8040pub fn _mm_rsqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8041    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
8042}
8043
8044/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
8045/// element in b, store the result in the lower element of dst using writemask k (the element is copied from src
8046/// when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8047/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8048///
8049/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt_sh)
8050#[inline]
8051#[target_feature(enable = "avx512fp16")]
8052#[cfg_attr(test, assert_instr(vrsqrtsh))]
8053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8054pub fn _mm_mask_rsqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8055    unsafe { vrsqrtsh(a, b, src, k) }
8056}
8057
8058/// Compute the approximate reciprocal square root of the lower half-precision (16-bit) floating-point
8059/// element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when
8060/// mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8061/// The maximum relative error for this approximation is less than `1.5*2^-12`.
8062///
8063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt_sh)
8064#[inline]
8065#[target_feature(enable = "avx512fp16")]
8066#[cfg_attr(test, assert_instr(vrsqrtsh))]
8067#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8068pub fn _mm_maskz_rsqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8069    _mm_mask_rsqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
8070}
8071
8072/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8073/// results in dst.
8074///
8075/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_ph)
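///
/// A minimal usage sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16` and `avx512vl`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(9.0);
/// // Each lane holds sqrt(9.0) = 3.0.
/// let r = _mm_sqrt_ph(a);
/// ```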
8076#[inline]
8077#[target_feature(enable = "avx512fp16,avx512vl")]
8078#[cfg_attr(test, assert_instr(vsqrtph))]
8079#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8080pub fn _mm_sqrt_ph(a: __m128h) -> __m128h {
8081    unsafe { simd_fsqrt(a) }
8082}
8083
8084/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8085/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8086///
8087/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ph)
8088#[inline]
8089#[target_feature(enable = "avx512fp16,avx512vl")]
8090#[cfg_attr(test, assert_instr(vsqrtph))]
8091#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8092pub fn _mm_mask_sqrt_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8093    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), src) }
8094}
8095
8096/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8097/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8098///
8099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ph)
8100#[inline]
8101#[target_feature(enable = "avx512fp16,avx512vl")]
8102#[cfg_attr(test, assert_instr(vsqrtph))]
8103#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8104pub fn _mm_maskz_sqrt_ph(k: __mmask8, a: __m128h) -> __m128h {
8105    unsafe { simd_select_bitmask(k, _mm_sqrt_ph(a), _mm_setzero_ph()) }
8106}
8107
8108/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8109/// results in dst.
8110///
8111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sqrt_ph)
8112#[inline]
8113#[target_feature(enable = "avx512fp16,avx512vl")]
8114#[cfg_attr(test, assert_instr(vsqrtph))]
8115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8116pub fn _mm256_sqrt_ph(a: __m256h) -> __m256h {
8117    unsafe { simd_fsqrt(a) }
8118}
8119
8120/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8121/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8122///
8123/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ph)
8124#[inline]
8125#[target_feature(enable = "avx512fp16,avx512vl")]
8126#[cfg_attr(test, assert_instr(vsqrtph))]
8127#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8128pub fn _mm256_mask_sqrt_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8129    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), src) }
8130}
8131
8132/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8133/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8134///
8135/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ph)
8136#[inline]
8137#[target_feature(enable = "avx512fp16,avx512vl")]
8138#[cfg_attr(test, assert_instr(vsqrtph))]
8139#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8140pub fn _mm256_maskz_sqrt_ph(k: __mmask16, a: __m256h) -> __m256h {
8141    unsafe { simd_select_bitmask(k, _mm256_sqrt_ph(a), _mm256_setzero_ph()) }
8142}
8143
8144/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8145/// results in dst.
8146///
8147/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ph)
8148#[inline]
8149#[target_feature(enable = "avx512fp16")]
8150#[cfg_attr(test, assert_instr(vsqrtph))]
8151#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8152pub fn _mm512_sqrt_ph(a: __m512h) -> __m512h {
8153    unsafe { simd_fsqrt(a) }
8154}
8155
8156/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8157/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8158///
8159/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ph)
8160#[inline]
8161#[target_feature(enable = "avx512fp16")]
8162#[cfg_attr(test, assert_instr(vsqrtph))]
8163#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8164pub fn _mm512_mask_sqrt_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
8165    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), src) }
8166}
8167
8168/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8169/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8170///
8171/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ph)
8172#[inline]
8173#[target_feature(enable = "avx512fp16")]
8174#[cfg_attr(test, assert_instr(vsqrtph))]
8175#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8176pub fn _mm512_maskz_sqrt_ph(k: __mmask32, a: __m512h) -> __m512h {
8177    unsafe { simd_select_bitmask(k, _mm512_sqrt_ph(a), _mm512_setzero_ph()) }
8178}
8179
8180/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8181/// results in dst.
///
8182/// Rounding is done according to the rounding parameter, which can be one of:
8183///
8184/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8185/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8186/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8187/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8188/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8189///
8190/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ph)
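///
/// A minimal sketch of selecting an explicit rounding mode, with illustrative values (assumes the
/// caller is compiled with `avx512fp16`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm512_set1_ph(2.0);
/// // Square root of every lane, truncating (round toward zero) and suppressing exceptions.
/// let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a);
/// ```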
8191#[inline]
8192#[target_feature(enable = "avx512fp16")]
8193#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8194#[rustc_legacy_const_generics(1)]
8195#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8196pub fn _mm512_sqrt_round_ph<const ROUNDING: i32>(a: __m512h) -> __m512h {
8197    unsafe {
8198        static_assert_rounding!(ROUNDING);
8199        vsqrtph_512(a, ROUNDING)
8200    }
8201}
8202
8203/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8204/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
8205/// Rounding is done according to the rounding parameter, which can be one of:
8206///
8207/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8208/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8209/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8210/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8211/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8212///
8213/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ph)
8214#[inline]
8215#[target_feature(enable = "avx512fp16")]
8216#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8217#[rustc_legacy_const_generics(3)]
8218#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8219pub fn _mm512_mask_sqrt_round_ph<const ROUNDING: i32>(
8220    src: __m512h,
8221    k: __mmask32,
8222    a: __m512h,
8223) -> __m512h {
8224    unsafe {
8225        static_assert_rounding!(ROUNDING);
8226        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), src)
8227    }
8228}
8229
8230/// Compute the square root of packed half-precision (16-bit) floating-point elements in a, and store the
8231/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
8232/// Rounding is done according to the rounding parameter, which can be one of:
8233///
8234/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8235/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8236/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8237/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8238/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8239///
8240/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ph)
8241#[inline]
8242#[target_feature(enable = "avx512fp16")]
8243#[cfg_attr(test, assert_instr(vsqrtph, ROUNDING = 8))]
8244#[rustc_legacy_const_generics(2)]
8245#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8246pub fn _mm512_maskz_sqrt_round_ph<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512h {
8247    unsafe {
8248        static_assert_rounding!(ROUNDING);
8249        simd_select_bitmask(k, _mm512_sqrt_round_ph::<ROUNDING>(a), _mm512_setzero_ph())
8250    }
8251}
8252
8253/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8254/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8255/// elements of dst.
8256///
8257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sh)
8258#[inline]
8259#[target_feature(enable = "avx512fp16")]
8260#[cfg_attr(test, assert_instr(vsqrtsh))]
8261#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8262pub fn _mm_sqrt_sh(a: __m128h, b: __m128h) -> __m128h {
8263    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
8264}
8265
8266/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8267/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8268/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8269///
8270/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_sh)
8271#[inline]
8272#[target_feature(enable = "avx512fp16")]
8273#[cfg_attr(test, assert_instr(vsqrtsh))]
8274#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8275pub fn _mm_mask_sqrt_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8276    _mm_mask_sqrt_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8277}
8278
8279/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8280/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8281/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
8282///
8283/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_sh)
8284#[inline]
8285#[target_feature(enable = "avx512fp16")]
8286#[cfg_attr(test, assert_instr(vsqrtsh))]
8287#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8288pub fn _mm_maskz_sqrt_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8289    _mm_mask_sqrt_sh(f16x8::ZERO.as_m128h(), k, a, b)
8290}
8291
8292/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8293/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
8294/// elements of dst.
///
8295/// Rounding is done according to the rounding parameter, which can be one of:
8296///
8297/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8298/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8299/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8300/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8301/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8302///
8303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_round_sh)
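///
/// A minimal usage sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(4.0);
/// // Lane 0 holds sqrt(4.0) = 2.0; lanes 1..=7 are copied from a (1.0).
/// let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
/// ```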
8304#[inline]
8305#[target_feature(enable = "avx512fp16")]
8306#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8307#[rustc_legacy_const_generics(2)]
8308#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8309pub fn _mm_sqrt_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
8310    static_assert_rounding!(ROUNDING);
8311    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
8312}
8313
8314/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8315/// the result in the lower element of dst using writemask k (the element is copied from src when mask
8316/// bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
8317/// Rounding is done according to the rounding parameter, which can be one of:
8318///
8319/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8320/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8321/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8322/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8323/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8324///
8325/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_round_sh)
8326#[inline]
8327#[target_feature(enable = "avx512fp16")]
8328#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8329#[rustc_legacy_const_generics(4)]
8330#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8331pub fn _mm_mask_sqrt_round_sh<const ROUNDING: i32>(
8332    src: __m128h,
8333    k: __mmask8,
8334    a: __m128h,
8335    b: __m128h,
8336) -> __m128h {
8337    unsafe {
8338        static_assert_rounding!(ROUNDING);
8339        vsqrtsh(a, b, src, k, ROUNDING)
8340    }
8341}
8342
8343/// Compute the square root of the lower half-precision (16-bit) floating-point element in b, store
8344/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0
8345/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
///
8346/// Rounding is done according to the rounding parameter, which can be one of:
8347///
8348/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
8349/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
8350/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
8351/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
8352/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
8353///
8354/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_round_sh)
8355#[inline]
8356#[target_feature(enable = "avx512fp16")]
8357#[cfg_attr(test, assert_instr(vsqrtsh, ROUNDING = 8))]
8358#[rustc_legacy_const_generics(3)]
8359#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8360pub fn _mm_maskz_sqrt_round_sh<const ROUNDING: i32>(
8361    k: __mmask8,
8362    a: __m128h,
8363    b: __m128h,
8364) -> __m128h {
8365    static_assert_rounding!(ROUNDING);
8366    _mm_mask_sqrt_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
8367}
8368
8369/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8370/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8371/// value when inputs are NaN or signed-zero values.
8372///
8373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_ph)
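///
/// A minimal usage sketch with illustrative values (assumes the caller is compiled with
/// `avx512fp16` and `avx512vl`; marked `ignore`, not run as a doctest):
///
/// ```ignore
/// let a = _mm_set1_ph(1.0);
/// let b = _mm_set1_ph(2.0);
/// // Each lane holds max(1.0, 2.0) = 2.0; see the note above about NaN and signed-zero inputs.
/// let r = _mm_max_ph(a, b);
/// ```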
8374#[inline]
8375#[target_feature(enable = "avx512fp16,avx512vl")]
8376#[cfg_attr(test, assert_instr(vmaxph))]
8377#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8378pub fn _mm_max_ph(a: __m128h, b: __m128h) -> __m128h {
8379    unsafe { vmaxph_128(a, b) }
8380}
8381
8382/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8383/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8384/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8385/// NaN or signed-zero values.
8386///
8387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ph)
8388#[inline]
8389#[target_feature(enable = "avx512fp16,avx512vl")]
8390#[cfg_attr(test, assert_instr(vmaxph))]
8391#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8392pub fn _mm_mask_max_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8393    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), src) }
8394}
8395
8396/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8397/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8398/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8399/// NaN or signed-zero values.
8400///
8401/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ph)
8402#[inline]
8403#[target_feature(enable = "avx512fp16,avx512vl")]
8404#[cfg_attr(test, assert_instr(vmaxph))]
8405#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8406pub fn _mm_maskz_max_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8407    unsafe { simd_select_bitmask(k, _mm_max_ph(a, b), _mm_setzero_ph()) }
8408}
8409
8410/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8411/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8412/// value when inputs are NaN or signed-zero values.
8413///
8414/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_ph)
8415#[inline]
8416#[target_feature(enable = "avx512fp16,avx512vl")]
8417#[cfg_attr(test, assert_instr(vmaxph))]
8418#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8419pub fn _mm256_max_ph(a: __m256h, b: __m256h) -> __m256h {
8420    unsafe { vmaxph_256(a, b) }
8421}
8422
8423/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8424/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8425/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8426/// NaN or signed-zero values.
8427///
8428/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ph)
8429#[inline]
8430#[target_feature(enable = "avx512fp16,avx512vl")]
8431#[cfg_attr(test, assert_instr(vmaxph))]
8432#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8433pub fn _mm256_mask_max_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8434    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), src) }
8435}
8436
8437/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8438/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8439/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8440/// NaN or signed-zero values.
8441///
8442/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ph)
8443#[inline]
8444#[target_feature(enable = "avx512fp16,avx512vl")]
8445#[cfg_attr(test, assert_instr(vmaxph))]
8446#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8447pub fn _mm256_maskz_max_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8448    unsafe { simd_select_bitmask(k, _mm256_max_ph(a, b), _mm256_setzero_ph()) }
8449}
8450
8451/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8452/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum
8453/// value when inputs are NaN or signed-zero values.
8454///
8455/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ph)
8456#[inline]
8457#[target_feature(enable = "avx512fp16")]
8458#[cfg_attr(test, assert_instr(vmaxph))]
8459#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8460pub fn _mm512_max_ph(a: __m512h, b: __m512h) -> __m512h {
8461    _mm512_max_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8462}
8463
8464/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8465/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8466/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8467/// NaN or signed-zero values.
8468///
8469/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ph)
8470#[inline]
8471#[target_feature(enable = "avx512fp16")]
8472#[cfg_attr(test, assert_instr(vmaxph))]
8473#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8474pub fn _mm512_mask_max_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8475    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), src) }
8476}
8477
8478/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8479/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8480/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8481/// NaN or signed-zero values.
8482///
8483/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ph)
8484#[inline]
8485#[target_feature(enable = "avx512fp16")]
8486#[cfg_attr(test, assert_instr(vmaxph))]
8487#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8488pub fn _mm512_maskz_max_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8489    unsafe { simd_select_bitmask(k, _mm512_max_ph(a, b), _mm512_setzero_ph()) }
8490}
8491
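// Illustrative sketch (not part of the original source): how the write-mask and
// zero-mask variants above differ. The helper name `masked_max_demo` is hypothetical,
// and the sketch assumes a nightly toolchain with the unstable `stdarch_x86_avx512_f16`
// and `f16` features enabled plus an AVX512-FP16 CPU.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn masked_max_demo(src: __m512h, a: __m512h, b: __m512h) -> (__m512h, __m512h) {
//         let k: __mmask32 = 0b0101; // mask bit i controls element i
//         // elements with a clear mask bit keep the corresponding element of `src`
//         let merged = _mm512_mask_max_ph(src, k, a, b);
//         // elements with a clear mask bit are zeroed instead
//         let zeroed = _mm512_maskz_max_ph(k, a, b);
//         (merged, zeroed)
//     }
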
8492/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8493/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
8494/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are
8495/// NaN or signed-zero values.
8496///
8497/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ph)
8498#[inline]
8499#[target_feature(enable = "avx512fp16")]
8500#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8501#[rustc_legacy_const_generics(2)]
8502#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8503pub fn _mm512_max_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8504    unsafe {
8505        static_assert_sae!(SAE);
8506        vmaxph_512(a, b, SAE)
8507    }
8508}
8509
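// Illustrative sketch (not part of the original source): typical use of the SAE
// const parameter on the `_round_` variants. The helper name `max_no_exceptions`
// is hypothetical; the same toolchain/CPU assumptions as above apply.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn max_no_exceptions(a: __m512h, b: __m512h) -> __m512h {
//         // _MM_FROUND_NO_EXC computes the maximum without reporting FP exceptions;
//         // the only other accepted value here is _MM_FROUND_CUR_DIRECTION.
//         _mm512_max_round_ph::<_MM_FROUND_NO_EXC>(a, b)
//     }
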
8510/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8511/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8512/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8513/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8514///
8515/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ph)
8516#[inline]
8517#[target_feature(enable = "avx512fp16")]
8518#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8519#[rustc_legacy_const_generics(4)]
8520#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8521pub fn _mm512_mask_max_round_ph<const SAE: i32>(
8522    src: __m512h,
8523    k: __mmask32,
8524    a: __m512h,
8525    b: __m512h,
8526) -> __m512h {
8527    unsafe {
8528        static_assert_sae!(SAE);
8529        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), src)
8530    }
8531}
8532
8533/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed maximum
8534/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8535/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8536/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8537///
8538/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ph)
8539#[inline]
8540#[target_feature(enable = "avx512fp16")]
8541#[cfg_attr(test, assert_instr(vmaxph, SAE = 8))]
8542#[rustc_legacy_const_generics(3)]
8543#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8544pub fn _mm512_maskz_max_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8545    unsafe {
8546        static_assert_sae!(SAE);
8547        simd_select_bitmask(k, _mm512_max_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8548    }
8549}
8550
8551/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8552/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8553/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value
8554/// when inputs are NaN or signed-zero values.
8555///
8556/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sh)
8557#[inline]
8558#[target_feature(enable = "avx512fp16,avx512vl")]
8559#[cfg_attr(test, assert_instr(vmaxsh))]
8560#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8561pub fn _mm_max_sh(a: __m128h, b: __m128h) -> __m128h {
8562    _mm_mask_max_sh(_mm_undefined_ph(), 0xff, a, b)
8563}
8564
8565/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum
8566/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8567/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8568/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8569///
8570/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_sh)
8571#[inline]
8572#[target_feature(enable = "avx512fp16,avx512vl")]
8573#[cfg_attr(test, assert_instr(vmaxsh))]
8574#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8575pub fn _mm_mask_max_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8576    _mm_mask_max_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8577}
8578
8579/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8580/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8581/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8582/// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8583///
8584/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_sh)
8585#[inline]
8586#[target_feature(enable = "avx512fp16,avx512vl")]
8587#[cfg_attr(test, assert_instr(vmaxsh))]
8588#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8589pub fn _mm_maskz_max_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8590    _mm_mask_max_sh(f16x8::ZERO.as_m128h(), k, a, b)
8591}
8592
8593/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8594/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8595/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8596/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8597///
8598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_round_sh)
8599#[inline]
8600#[target_feature(enable = "avx512fp16,avx512vl")]
8601#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8602#[rustc_legacy_const_generics(2)]
8603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8604pub fn _mm_max_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8605    static_assert_sae!(SAE);
8606    _mm_mask_max_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8607}
8608
8609/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8610/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8611/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8612/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8613/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8614///
8615/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_round_sh)
8616#[inline]
8617#[target_feature(enable = "avx512fp16,avx512vl")]
8618#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8619#[rustc_legacy_const_generics(4)]
8620#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8621pub fn _mm_mask_max_round_sh<const SAE: i32>(
8622    src: __m128h,
8623    k: __mmask8,
8624    a: __m128h,
8625    b: __m128h,
8626) -> __m128h {
8627    unsafe {
8628        static_assert_sae!(SAE);
8629        vmaxsh(a, b, src, k, SAE)
8630    }
8631}
8632
8633/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the maximum value
8634/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8635/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8636/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8637/// (IEEE 754) maximum value when inputs are NaN or signed-zero values.
8638///
8639/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_round_sh)
8640#[inline]
8641#[target_feature(enable = "avx512fp16,avx512vl")]
8642#[cfg_attr(test, assert_instr(vmaxsh, SAE = 8))]
8643#[rustc_legacy_const_generics(3)]
8644#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8645pub fn _mm_maskz_max_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8646    static_assert_sae!(SAE);
8647    _mm_mask_max_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
8648}
8649
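// Illustrative sketch (not part of the original source): the scalar `_sh` variants
// only combine the lowest element; the remaining seven elements always come from `a`.
// The helper name `max_sh_demo` is hypothetical; the same toolchain/CPU assumptions
// as above apply.
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     fn max_sh_demo() -> __m128h {
//         let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0); // lowest element = 1.0
//         let b = _mm_set1_ph(5.0);
//         // result: lowest element = max(1.0, 5.0) = 5.0, elements 1..=7 copied from `a`
//         _mm_max_sh(a, b)
//     }
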
8650/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8651/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8652/// when inputs are NaN or signed-zero values.
8653///
8654/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_ph)
8655#[inline]
8656#[target_feature(enable = "avx512fp16,avx512vl")]
8657#[cfg_attr(test, assert_instr(vminph))]
8658#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8659pub fn _mm_min_ph(a: __m128h, b: __m128h) -> __m128h {
8660    unsafe { vminph_128(a, b) }
8661}
8662
8663/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8664/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8665/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8666/// NaN or signed-zero values.
8667///
8668/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ph)
8669#[inline]
8670#[target_feature(enable = "avx512fp16,avx512vl")]
8671#[cfg_attr(test, assert_instr(vminph))]
8672#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8673pub fn _mm_mask_min_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8674    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), src) }
8675}
8676
8677/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8678/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8679/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8680/// NaN or signed-zero values.
8681///
8682/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ph)
8683#[inline]
8684#[target_feature(enable = "avx512fp16,avx512vl")]
8685#[cfg_attr(test, assert_instr(vminph))]
8686#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8687pub fn _mm_maskz_min_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8688    unsafe { simd_select_bitmask(k, _mm_min_ph(a, b), _mm_setzero_ph()) }
8689}
8690
8691/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8692/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8693/// when inputs are NaN or signed-zero values.
8694///
8695/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_ph)
8696#[inline]
8697#[target_feature(enable = "avx512fp16,avx512vl")]
8698#[cfg_attr(test, assert_instr(vminph))]
8699#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8700pub fn _mm256_min_ph(a: __m256h, b: __m256h) -> __m256h {
8701    unsafe { vminph_256(a, b) }
8702}
8703
8704/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8705/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8706/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8707/// NaN or signed-zero values.
8708///
8709/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ph)
8710#[inline]
8711#[target_feature(enable = "avx512fp16,avx512vl")]
8712#[cfg_attr(test, assert_instr(vminph))]
8713#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8714pub fn _mm256_mask_min_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8715    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), src) }
8716}
8717
8718/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8719/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8720/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8721/// NaN or signed-zero values.
8722///
8723/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ph)
8724#[inline]
8725#[target_feature(enable = "avx512fp16,avx512vl")]
8726#[cfg_attr(test, assert_instr(vminph))]
8727#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8728pub fn _mm256_maskz_min_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
8729    unsafe { simd_select_bitmask(k, _mm256_min_ph(a, b), _mm256_setzero_ph()) }
8730}
8731
8732/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8733/// values in dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value
8734/// when inputs are NaN or signed-zero values.
8735///
8736/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ph)
8737#[inline]
8738#[target_feature(enable = "avx512fp16")]
8739#[cfg_attr(test, assert_instr(vminph))]
8740#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8741pub fn _mm512_min_ph(a: __m512h, b: __m512h) -> __m512h {
8742    _mm512_min_round_ph::<_MM_FROUND_CUR_DIRECTION>(a, b)
8743}
8744
8745/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8746/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8747/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8748/// NaN or signed-zero values.
8749///
8750/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ph)
8751#[inline]
8752#[target_feature(enable = "avx512fp16")]
8753#[cfg_attr(test, assert_instr(vminph))]
8754#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8755pub fn _mm512_mask_min_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8756    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), src) }
8757}
8758
8759/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8760/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8761/// Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are
8762/// NaN or signed-zero values.
8763///
8764/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ph)
8765#[inline]
8766#[target_feature(enable = "avx512fp16")]
8767#[cfg_attr(test, assert_instr(vminph))]
8768#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8769pub fn _mm512_maskz_min_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8770    unsafe { simd_select_bitmask(k, _mm512_min_ph(a, b), _mm512_setzero_ph()) }
8771}
8772
8773/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8774/// values in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not
8775/// follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8776///
8777/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ph)
8778#[inline]
8779#[target_feature(enable = "avx512fp16")]
8780#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8781#[rustc_legacy_const_generics(2)]
8782#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8783pub fn _mm512_min_round_ph<const SAE: i32>(a: __m512h, b: __m512h) -> __m512h {
8784    unsafe {
8785        static_assert_sae!(SAE);
8786        vminph_512(a, b, SAE)
8787    }
8788}
8789
8790/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8791/// values in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
8792/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8793/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8794///
8795/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ph)
8796#[inline]
8797#[target_feature(enable = "avx512fp16")]
8798#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8799#[rustc_legacy_const_generics(4)]
8800#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8801pub fn _mm512_mask_min_round_ph<const SAE: i32>(
8802    src: __m512h,
8803    k: __mmask32,
8804    a: __m512h,
8805    b: __m512h,
8806) -> __m512h {
8807    unsafe {
8808        static_assert_sae!(SAE);
8809        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), src)
8810    }
8811}
8812
8813/// Compare packed half-precision (16-bit) floating-point elements in a and b, and store packed minimum
8814/// values in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
8815/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8816/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8817///
8818/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ph)
8819#[inline]
8820#[target_feature(enable = "avx512fp16")]
8821#[cfg_attr(test, assert_instr(vminph, SAE = 8))]
8822#[rustc_legacy_const_generics(3)]
8823#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8824pub fn _mm512_maskz_min_round_ph<const SAE: i32>(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
8825    unsafe {
8826        static_assert_sae!(SAE);
8827        simd_select_bitmask(k, _mm512_min_round_ph::<SAE>(a, b), _mm512_setzero_ph())
8828    }
8829}
8830
8831/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8832/// value in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
8833/// of dst. Does not follow the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
8834/// inputs are NaN or signed-zero values.
8835///
8836/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sh)
8837#[inline]
8838#[target_feature(enable = "avx512fp16,avx512vl")]
8839#[cfg_attr(test, assert_instr(vminsh))]
8840#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8841pub fn _mm_min_sh(a: __m128h, b: __m128h) -> __m128h {
8842    _mm_mask_min_sh(_mm_undefined_ph(), 0xff, a, b)
8843}
8844
8845/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum
8846/// value in the lower element of dst using writemask k (the element is copied from src when mask bit 0
8847/// is not set), and copy the upper 7 packed elements from a to the upper elements of dst. Does not follow
8848/// the IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8849///
8850/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_sh)
8851#[inline]
8852#[target_feature(enable = "avx512fp16,avx512vl")]
8853#[cfg_attr(test, assert_instr(vminsh))]
8854#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8855pub fn _mm_mask_min_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8856    _mm_mask_min_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
8857}
8858
8859/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8860/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8861/// copy the upper 7 packed elements from a to the upper elements of dst. Does not follow the IEEE Standard
8862/// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8863///
8864/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_sh)
8865#[inline]
8866#[target_feature(enable = "avx512fp16,avx512vl")]
8867#[cfg_attr(test, assert_instr(vminsh))]
8868#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8869pub fn _mm_maskz_min_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8870    _mm_mask_min_sh(f16x8::ZERO.as_m128h(), k, a, b)
8871}
8872
8873/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8874/// in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements of dst.
8875/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the
8876/// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8877///
8878/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_round_sh)
8879#[inline]
8880#[target_feature(enable = "avx512fp16,avx512vl")]
8881#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8882#[rustc_legacy_const_generics(2)]
8883#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8884pub fn _mm_min_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
8885    static_assert_sae!(SAE);
8886    _mm_mask_min_round_sh::<SAE>(_mm_undefined_ph(), 0xff, a, b)
8887}
8888
8889/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8890/// in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
8891/// and copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8892/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8893/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8894///
8895/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_round_sh)
8896#[inline]
8897#[target_feature(enable = "avx512fp16,avx512vl")]
8898#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8899#[rustc_legacy_const_generics(4)]
8900#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8901pub fn _mm_mask_min_round_sh<const SAE: i32>(
8902    src: __m128h,
8903    k: __mmask8,
8904    a: __m128h,
8905    b: __m128h,
8906) -> __m128h {
8907    unsafe {
8908        static_assert_sae!(SAE);
8909        vminsh(a, b, src, k, SAE)
8910    }
8911}
8912
8913/// Compare the lower half-precision (16-bit) floating-point elements in a and b, store the minimum value
8914/// in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and
8915/// copy the upper 7 packed elements from a to the upper elements of dst. Exceptions can be suppressed by
8916/// passing _MM_FROUND_NO_EXC in the sae parameter. Does not follow the IEEE Standard for Floating-Point Arithmetic
8917/// (IEEE 754) minimum value when inputs are NaN or signed-zero values.
8918///
8919/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_round_sh)
8920#[inline]
8921#[target_feature(enable = "avx512fp16,avx512vl")]
8922#[cfg_attr(test, assert_instr(vminsh, SAE = 8))]
8923#[rustc_legacy_const_generics(3)]
8924#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8925pub fn _mm_maskz_min_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
8926    static_assert_sae!(SAE);
8927    _mm_mask_min_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
8928}
8929
8930/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8931/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8932/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8933///
8934/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ph)
8935#[inline]
8936#[target_feature(enable = "avx512fp16,avx512vl")]
8937#[cfg_attr(test, assert_instr(vgetexpph))]
8938#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8939pub fn _mm_getexp_ph(a: __m128h) -> __m128h {
8940    _mm_mask_getexp_ph(_mm_undefined_ph(), 0xff, a)
8941}
8942
8943/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8944/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8945/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8946/// `floor(log2(x))` for each element.
8947///
8948/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ph)
8949#[inline]
8950#[target_feature(enable = "avx512fp16,avx512vl")]
8951#[cfg_attr(test, assert_instr(vgetexpph))]
8952#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8953pub fn _mm_mask_getexp_ph(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
8954    unsafe { vgetexpph_128(a, src, k) }
8955}
8956
8957/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8958/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
8959/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
8960/// `floor(log2(x))` for each element.
8961///
8962/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ph)
8963#[inline]
8964#[target_feature(enable = "avx512fp16,avx512vl")]
8965#[cfg_attr(test, assert_instr(vgetexpph))]
8966#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8967pub fn _mm_maskz_getexp_ph(k: __mmask8, a: __m128h) -> __m128h {
8968    _mm_mask_getexp_ph(_mm_setzero_ph(), k, a)
8969}
8970
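// Illustrative sketch (not part of the original source): a worked example of the
// `floor(log2(x))` behavior described above. The helper name `getexp_demo` is
// hypothetical; the same toolchain/CPU assumptions as above apply.
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     fn getexp_demo() -> __m128h {
//         let a = _mm_set1_ph(8.5);
//         // every element becomes 3.0, since floor(log2(8.5)) == 3
//         _mm_getexp_ph(a)
//     }
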
8971/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8972/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
8973/// This intrinsic essentially calculates `floor(log2(x))` for each element.
8974///
8975/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ph)
8976#[inline]
8977#[target_feature(enable = "avx512fp16,avx512vl")]
8978#[cfg_attr(test, assert_instr(vgetexpph))]
8979#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8980pub fn _mm256_getexp_ph(a: __m256h) -> __m256h {
8981    _mm256_mask_getexp_ph(_mm256_undefined_ph(), 0xffff, a)
8982}
8983
8984/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8985/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
8986/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
8987/// `floor(log2(x))` for each element.
8988///
8989/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ph)
8990#[inline]
8991#[target_feature(enable = "avx512fp16,avx512vl")]
8992#[cfg_attr(test, assert_instr(vgetexpph))]
8993#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
8994pub fn _mm256_mask_getexp_ph(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
8995    unsafe { vgetexpph_256(a, src, k) }
8996}
8997
8998/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
8999/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9000/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9001/// `floor(log2(x))` for each element.
9002///
9003/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ph)
9004#[inline]
9005#[target_feature(enable = "avx512fp16,avx512vl")]
9006#[cfg_attr(test, assert_instr(vgetexpph))]
9007#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9008pub fn _mm256_maskz_getexp_ph(k: __mmask16, a: __m256h) -> __m256h {
9009    _mm256_mask_getexp_ph(_mm256_setzero_ph(), k, a)
9010}
9011
9012/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9013/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
9014/// This intrinsic essentially calculates `floor(log2(x))` for each element.
9015///
9016/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ph)
9017#[inline]
9018#[target_feature(enable = "avx512fp16")]
9019#[cfg_attr(test, assert_instr(vgetexpph))]
9020#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9021pub fn _mm512_getexp_ph(a: __m512h) -> __m512h {
9022    _mm512_mask_getexp_ph(_mm512_undefined_ph(), 0xffffffff, a)
9023}
9024
9025/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9026/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
9027/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
9028/// `floor(log2(x))` for each element.
9029///
9030/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ph)
9031#[inline]
9032#[target_feature(enable = "avx512fp16")]
9033#[cfg_attr(test, assert_instr(vgetexpph))]
9034#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9035pub fn _mm512_mask_getexp_ph(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
9036    _mm512_mask_getexp_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a)
9037}
9038
9039/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9040/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9041/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9042/// `floor(log2(x))` for each element.
9043///
9044/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ph)
9045#[inline]
9046#[target_feature(enable = "avx512fp16")]
9047#[cfg_attr(test, assert_instr(vgetexpph))]
9048#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9049pub fn _mm512_maskz_getexp_ph(k: __mmask32, a: __m512h) -> __m512h {
9050    _mm512_mask_getexp_ph(_mm512_setzero_ph(), k, a)
9051}
9052
9053/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9054/// (16-bit) floating-point number representing the integer exponent, and store the results in dst.
9055/// This intrinsic essentially calculates `floor(log2(x))` for each element. Exceptions can be suppressed
9056/// by passing _MM_FROUND_NO_EXC in the sae parameter.
9057///
9058/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ph)
9059#[inline]
9060#[target_feature(enable = "avx512fp16")]
9061#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9062#[rustc_legacy_const_generics(1)]
9063#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9064pub fn _mm512_getexp_round_ph<const SAE: i32>(a: __m512h) -> __m512h {
9065    static_assert_sae!(SAE);
9066    _mm512_mask_getexp_round_ph::<SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9067}
9068
9069/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9070/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using writemask k
9071/// (elements are copied from src when the corresponding mask bit is not set). This intrinsic essentially calculates
9072/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9073///
9074/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ph)
9075#[inline]
9076#[target_feature(enable = "avx512fp16")]
9077#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9078#[rustc_legacy_const_generics(3)]
9079#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9080pub fn _mm512_mask_getexp_round_ph<const SAE: i32>(
9081    src: __m512h,
9082    k: __mmask32,
9083    a: __m512h,
9084) -> __m512h {
9085    unsafe {
9086        static_assert_sae!(SAE);
9087        vgetexpph_512(a, src, k, SAE)
9088    }
9089}
9090
9091/// Convert the exponent of each packed half-precision (16-bit) floating-point element in a to a half-precision
9092/// (16-bit) floating-point number representing the integer exponent, and store the results in dst using zeromask
9093/// k (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates
9094/// `floor(log2(x))` for each element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9095///
9096/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ph)
9097#[inline]
9098#[target_feature(enable = "avx512fp16")]
9099#[cfg_attr(test, assert_instr(vgetexpph, SAE = 8))]
9100#[rustc_legacy_const_generics(2)]
9101#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9102pub fn _mm512_maskz_getexp_round_ph<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512h {
9103    static_assert_sae!(SAE);
9104    _mm512_mask_getexp_round_ph::<SAE>(_mm512_setzero_ph(), k, a)
9105}
9106
9107/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9108/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9109/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9110/// calculates `floor(log2(x))` for the lower element.
9111///
9112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_sh)
9113#[inline]
9114#[target_feature(enable = "avx512fp16")]
9115#[cfg_attr(test, assert_instr(vgetexpsh))]
9116#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9117pub fn _mm_getexp_sh(a: __m128h, b: __m128h) -> __m128h {
9118    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
9119}
9120
9121/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9122/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9123/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9124/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9125/// for the lower element.
9126///
9127/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_sh)
9128#[inline]
9129#[target_feature(enable = "avx512fp16")]
9130#[cfg_attr(test, assert_instr(vgetexpsh))]
9131#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9132pub fn _mm_mask_getexp_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9133    _mm_mask_getexp_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9134}
9135
9136/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9137/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9138/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9139/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9140/// lower element.
9141///
9142/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_sh)
9143#[inline]
9144#[target_feature(enable = "avx512fp16")]
9145#[cfg_attr(test, assert_instr(vgetexpsh))]
9146#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9147pub fn _mm_maskz_getexp_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9148    _mm_mask_getexp_sh(f16x8::ZERO.as_m128h(), k, a, b)
9149}
9150
9151/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9152/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9153/// of dst, and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially
9154/// calculates `floor(log2(x))` for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9155/// in the sae parameter.
9156///
9157/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_round_sh)
9158#[inline]
9159#[target_feature(enable = "avx512fp16")]
9160#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9161#[rustc_legacy_const_generics(2)]
9162#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9163pub fn _mm_getexp_round_sh<const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
9164    static_assert_sae!(SAE);
9165    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9166}
9167
9168/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9169/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9170/// of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 7
9171/// packed elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))`
9172/// for the lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9173///
9174/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_round_sh)
9175#[inline]
9176#[target_feature(enable = "avx512fp16")]
9177#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9178#[rustc_legacy_const_generics(4)]
9179#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9180pub fn _mm_mask_getexp_round_sh<const SAE: i32>(
9181    src: __m128h,
9182    k: __mmask8,
9183    a: __m128h,
9184    b: __m128h,
9185) -> __m128h {
9186    unsafe {
9187        static_assert_sae!(SAE);
9188        vgetexpsh(a, b, src, k, SAE)
9189    }
9190}
9191
9192/// Convert the exponent of the lower half-precision (16-bit) floating-point element in b to a half-precision
9193/// (16-bit) floating-point number representing the integer exponent, store the result in the lower element
9194/// of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed
9195/// elements from a to the upper elements of dst. This intrinsic essentially calculates `floor(log2(x))` for the
9196/// lower element. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9197///
9198/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_round_sh)
9199#[inline]
9200#[target_feature(enable = "avx512fp16")]
9201#[cfg_attr(test, assert_instr(vgetexpsh, SAE = 8))]
9202#[rustc_legacy_const_generics(3)]
9203#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9204pub fn _mm_maskz_getexp_round_sh<const SAE: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
9205    static_assert_sae!(SAE);
9206    _mm_mask_getexp_round_sh::<SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9207}
9208
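// Illustrative sketch (not part of the original source): the scalar getexp variant
// takes the exponent from the lowest element of `b` and copies the upper seven
// elements from `a`. The helper name `getexp_sh_demo` is hypothetical; the same
// toolchain/CPU assumptions as above apply.
//
//     #[target_feature(enable = "avx512fp16")]
//     fn getexp_sh_demo(a: __m128h) -> __m128h {
//         let b = _mm_set_sh(32.0);
//         // lowest element = floor(log2(32.0)) = 5.0; elements 1..=7 are copied from `a`
//         _mm_getexp_sh(a, b)
//     }
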
9209/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9210/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9211/// on the interval range defined by norm and the sign depends on sign and the source sign.
9212///
9213/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9214///
9215///     _MM_MANT_NORM_1_2     // interval [1, 2)
9216///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9217///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9218///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9219///
9220/// The sign is determined by sc which can take the following values:
9221///
9222///     _MM_MANT_SIGN_src     // sign = sign(src)
9223///     _MM_MANT_SIGN_zero    // sign = 0
9224///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9225///
9226/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ph)
9227#[inline]
9228#[target_feature(enable = "avx512fp16,avx512vl")]
9229#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9230#[rustc_legacy_const_generics(1, 2)]
9231#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9232pub fn _mm_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9233    a: __m128h,
9234) -> __m128h {
9235    static_assert_uimm_bits!(NORM, 4);
9236    static_assert_uimm_bits!(SIGN, 2);
9237    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_undefined_ph(), 0xff, a)
9238}
9239
9240/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9241/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9242/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9243/// by norm and the sign depends on sign and the source sign.
9244///
9245/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9246///
9247///     _MM_MANT_NORM_1_2     // interval [1, 2)
9248///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9249///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9250///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9251///
9252/// The sign is determined by sc which can take the following values:
9253///
9254///     _MM_MANT_SIGN_src     // sign = sign(src)
9255///     _MM_MANT_SIGN_zero    // sign = 0
9256///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9257///
9258/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ph)
9259#[inline]
9260#[target_feature(enable = "avx512fp16,avx512vl")]
9261#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9262#[rustc_legacy_const_generics(3, 4)]
9263#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9264pub fn _mm_mask_getmant_ph<
9265    const NORM: _MM_MANTISSA_NORM_ENUM,
9266    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9267>(
9268    src: __m128h,
9269    k: __mmask8,
9270    a: __m128h,
9271) -> __m128h {
9272    unsafe {
9273        static_assert_uimm_bits!(NORM, 4);
9274        static_assert_uimm_bits!(SIGN, 2);
9275        vgetmantph_128(a, (SIGN << 2) | NORM, src, k)
9276    }
9277}
9278
9279/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9280/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9281/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9282/// by norm and the sign depends on sign and the source sign.
9283///
9284/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9285///
9286///     _MM_MANT_NORM_1_2     // interval [1, 2)
9287///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9288///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9289///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9290///
9291/// The sign is determined by sc which can take the following values:
9292///
9293///     _MM_MANT_SIGN_src     // sign = sign(src)
9294///     _MM_MANT_SIGN_zero    // sign = 0
9295///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9296///
9297/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ph)
9298#[inline]
9299#[target_feature(enable = "avx512fp16,avx512vl")]
9300#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9301#[rustc_legacy_const_generics(2, 3)]
9302#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9303pub fn _mm_maskz_getmant_ph<
9304    const NORM: _MM_MANTISSA_NORM_ENUM,
9305    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9306>(
9307    k: __mmask8,
9308    a: __m128h,
9309) -> __m128h {
9310    static_assert_uimm_bits!(NORM, 4);
9311    static_assert_uimm_bits!(SIGN, 2);
9312    _mm_mask_getmant_ph::<NORM, SIGN>(_mm_setzero_ph(), k, a)
9313}
9314
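// Illustrative sketch (not part of the original source): a worked example of the
// mantissa normalization described above. The helper name `getmant_demo` is
// hypothetical; the same toolchain/CPU assumptions as above apply.
//
//     #[target_feature(enable = "avx512fp16,avx512vl")]
//     fn getmant_demo() -> __m128h {
//         let a = _mm_set1_ph(10.0);
//         // 10.0 == 1.25 * 2^3, so every element becomes 1.25 with the [1, 2) interval
//         // and the sign taken from the source value.
//         _mm_getmant_ph::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_src>(a)
//     }
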
9315/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9316/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9317/// on the interval range defined by norm and the sign depends on sign and the source sign.
9318///
9319/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9320///
9321///     _MM_MANT_NORM_1_2     // interval [1, 2)
9322///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9323///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9324///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9325///
9326/// The sign is determined by sc which can take the following values:
9327///
9328///     _MM_MANT_SIGN_src     // sign = sign(src)
9329///     _MM_MANT_SIGN_zero    // sign = 0
9330///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9331///
9332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ph)
9333#[inline]
9334#[target_feature(enable = "avx512fp16,avx512vl")]
9335#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9336#[rustc_legacy_const_generics(1, 2)]
9337#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9338pub fn _mm256_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9339    a: __m256h,
9340) -> __m256h {
9341    static_assert_uimm_bits!(NORM, 4);
9342    static_assert_uimm_bits!(SIGN, 2);
9343    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_undefined_ph(), 0xffff, a)
9344}
9345
9346/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9347/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9348/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9349/// by norm and the sign depends on sign and the source sign.
9350///
9351/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9352///
9353///     _MM_MANT_NORM_1_2     // interval [1, 2)
9354///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9355///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9356///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9357///
9358/// The sign is determined by sc which can take the following values:
9359///
9360///     _MM_MANT_SIGN_src     // sign = sign(src)
9361///     _MM_MANT_SIGN_zero    // sign = 0
9362///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9363///
9364/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ph)
9365#[inline]
9366#[target_feature(enable = "avx512fp16,avx512vl")]
9367#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9368#[rustc_legacy_const_generics(3, 4)]
9369#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9370pub fn _mm256_mask_getmant_ph<
9371    const NORM: _MM_MANTISSA_NORM_ENUM,
9372    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9373>(
9374    src: __m256h,
9375    k: __mmask16,
9376    a: __m256h,
9377) -> __m256h {
9378    unsafe {
9379        static_assert_uimm_bits!(NORM, 4);
9380        static_assert_uimm_bits!(SIGN, 2);
9381        vgetmantph_256(a, (SIGN << 2) | NORM, src, k)
9382    }
9383}
9384
9385/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9386/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9387/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9388/// by norm and the sign depends on sign and the source sign.
9389///
9390/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9391///
9392///     _MM_MANT_NORM_1_2     // interval [1, 2)
9393///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9394///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9395///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9396///
9397/// The sign is determined by sc which can take the following values:
9398///
9399///     _MM_MANT_SIGN_src     // sign = sign(src)
9400///     _MM_MANT_SIGN_zero    // sign = 0
9401///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9402///
9403/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ph)
9404#[inline]
9405#[target_feature(enable = "avx512fp16,avx512vl")]
9406#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9407#[rustc_legacy_const_generics(2, 3)]
9408#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9409pub fn _mm256_maskz_getmant_ph<
9410    const NORM: _MM_MANTISSA_NORM_ENUM,
9411    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9412>(
9413    k: __mmask16,
9414    a: __m256h,
9415) -> __m256h {
9416    static_assert_uimm_bits!(NORM, 4);
9417    static_assert_uimm_bits!(SIGN, 2);
9418    _mm256_mask_getmant_ph::<NORM, SIGN>(_mm256_setzero_ph(), k, a)
9419}
9420
9421/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9422/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9423/// on the interval range defined by norm and the sign depends on sign and the source sign.
9424///
9425/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9426///
9427///     _MM_MANT_NORM_1_2     // interval [1, 2)
9428///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9429///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9430///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9431///
9432/// The sign is determined by sc which can take the following values:
9433///
9434///     _MM_MANT_SIGN_src     // sign = sign(src)
9435///     _MM_MANT_SIGN_zero    // sign = 0
9436///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9437///
9438/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ph)
9439#[inline]
9440#[target_feature(enable = "avx512fp16")]
9441#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9442#[rustc_legacy_const_generics(1, 2)]
9443#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9444pub fn _mm512_getmant_ph<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9445    a: __m512h,
9446) -> __m512h {
9447    static_assert_uimm_bits!(NORM, 4);
9448    static_assert_uimm_bits!(SIGN, 2);
9449    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_undefined_ph(), 0xffffffff, a)
9450}
9451
9452/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9453/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9454/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9455/// by norm and the sign depends on sign and the source sign.
9456///
9457/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9458///
9459///     _MM_MANT_NORM_1_2     // interval [1, 2)
9460///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9461///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9462///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9463///
9464/// The sign is determined by sc which can take the following values:
9465///
9466///     _MM_MANT_SIGN_src     // sign = sign(src)
9467///     _MM_MANT_SIGN_zero    // sign = 0
9468///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9469///
9470/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ph)
9471#[inline]
9472#[target_feature(enable = "avx512fp16")]
9473#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9474#[rustc_legacy_const_generics(3, 4)]
9475#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9476pub fn _mm512_mask_getmant_ph<
9477    const NORM: _MM_MANTISSA_NORM_ENUM,
9478    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9479>(
9480    src: __m512h,
9481    k: __mmask32,
9482    a: __m512h,
9483) -> __m512h {
9484    static_assert_uimm_bits!(NORM, 4);
9485    static_assert_uimm_bits!(SIGN, 2);
9486    _mm512_mask_getmant_round_ph::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a)
9487}
9488
9489/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9490/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9491/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9492/// by norm and the sign depends on sign and the source sign.
9493///
9494/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9495///
9496///     _MM_MANT_NORM_1_2     // interval [1, 2)
9497///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9498///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9499///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9500///
9501/// The sign is determined by sc which can take the following values:
9502///
9503///     _MM_MANT_SIGN_src     // sign = sign(src)
9504///     _MM_MANT_SIGN_zero    // sign = 0
9505///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9506///
9507/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ph)
9508#[inline]
9509#[target_feature(enable = "avx512fp16")]
9510#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0))]
9511#[rustc_legacy_const_generics(2, 3)]
9512#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9513pub fn _mm512_maskz_getmant_ph<
9514    const NORM: _MM_MANTISSA_NORM_ENUM,
9515    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9516>(
9517    k: __mmask32,
9518    a: __m512h,
9519) -> __m512h {
9520    static_assert_uimm_bits!(NORM, 4);
9521    static_assert_uimm_bits!(SIGN, 2);
9522    _mm512_mask_getmant_ph::<NORM, SIGN>(_mm512_setzero_ph(), k, a)
9523}
9524
9525/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9526/// the results in dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9527/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9528/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9529///
9530/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9531///
9532///     _MM_MANT_NORM_1_2     // interval [1, 2)
9533///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9534///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9535///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9536///
9537/// The sign is determined by sc which can take the following values:
9538///
9539///     _MM_MANT_SIGN_src     // sign = sign(src)
9540///     _MM_MANT_SIGN_zero    // sign = 0
9541///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9544///
9545/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ph)
9546#[inline]
9547#[target_feature(enable = "avx512fp16")]
9548#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9549#[rustc_legacy_const_generics(1, 2, 3)]
9550#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9551pub fn _mm512_getmant_round_ph<
9552    const NORM: _MM_MANTISSA_NORM_ENUM,
9553    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9554    const SAE: i32,
9555>(
9556    a: __m512h,
9557) -> __m512h {
9558    static_assert_uimm_bits!(NORM, 4);
9559    static_assert_uimm_bits!(SIGN, 2);
9560    static_assert_sae!(SAE);
9561    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
9562}
9563
9564/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9565/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
9566/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9567/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9568/// in the sae parameter.
9569///
9570/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9571///
9572///     _MM_MANT_NORM_1_2     // interval [1, 2)
9573///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9574///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9575///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9576///
9577/// The sign is determined by sc which can take the following values:
9578///
9579///     _MM_MANT_SIGN_src     // sign = sign(src)
9580///     _MM_MANT_SIGN_zero    // sign = 0
9581///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9584///
9585/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ph)
9586#[inline]
9587#[target_feature(enable = "avx512fp16")]
9588#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9589#[rustc_legacy_const_generics(3, 4, 5)]
9590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9591pub fn _mm512_mask_getmant_round_ph<
9592    const NORM: _MM_MANTISSA_NORM_ENUM,
9593    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9594    const SAE: i32,
9595>(
9596    src: __m512h,
9597    k: __mmask32,
9598    a: __m512h,
9599) -> __m512h {
9600    unsafe {
9601        static_assert_uimm_bits!(NORM, 4);
9602        static_assert_uimm_bits!(SIGN, 2);
9603        static_assert_sae!(SAE);
9604        vgetmantph_512(a, (SIGN << 2) | NORM, src, k, SAE)
9605    }
9606}
9607
9608/// Normalize the mantissas of packed half-precision (16-bit) floating-point elements in a, and store
9609/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
9610/// This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends on the interval range defined
9611/// by norm and the sign depends on sign and the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
9612/// in the sae parameter.
9613///
9614/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9615///
9616///     _MM_MANT_NORM_1_2     // interval [1, 2)
9617///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9618///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9619///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9620///
9621/// The sign is determined by sc which can take the following values:
9622///
9623///     _MM_MANT_SIGN_src     // sign = sign(src)
9624///     _MM_MANT_SIGN_zero    // sign = 0
9625///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9628///
9629/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ph)
9630#[inline]
9631#[target_feature(enable = "avx512fp16")]
9632#[cfg_attr(test, assert_instr(vgetmantph, NORM = 0, SIGN = 0, SAE = 8))]
9633#[rustc_legacy_const_generics(2, 3, 4)]
9634#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9635pub fn _mm512_maskz_getmant_round_ph<
9636    const NORM: _MM_MANTISSA_NORM_ENUM,
9637    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9638    const SAE: i32,
9639>(
9640    k: __mmask32,
9641    a: __m512h,
9642) -> __m512h {
9643    static_assert_uimm_bits!(NORM, 4);
9644    static_assert_uimm_bits!(SIGN, 2);
9645    static_assert_sae!(SAE);
9646    _mm512_mask_getmant_round_ph::<NORM, SIGN, SAE>(_mm512_setzero_ph(), k, a)
9647}
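
// Sketch of the `_round` form under zero-masking (illustrative helper, not part of the
// module): the extra SAE immediate must be either `_MM_FROUND_CUR_DIRECTION` or
// `_MM_FROUND_NO_EXC`; the latter suppresses floating-point exceptions.
#[target_feature(enable = "avx512fp16")]
fn example_getmant_no_exc(k: __mmask32, a: __m512h) -> __m512h {
    const NORM_P5_1: _MM_MANTISSA_NORM_ENUM = 2; // interval [0.5, 1)
    const SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 1; // sign = 0
    _mm512_maskz_getmant_round_ph::<NORM_P5_1, SIGN_ZERO, _MM_FROUND_NO_EXC>(k, a)
}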
9648
9649/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9650/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9651/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9652/// on the interval range defined by norm and the sign depends on sign and the source sign.
9653///
9654/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9655///
9656///     _MM_MANT_NORM_1_2     // interval [1, 2)
9657///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9658///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9659///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9660///
9661/// The sign is determined by sc which can take the following values:
9662///
9663///     _MM_MANT_SIGN_src     // sign = sign(src)
9664///     _MM_MANT_SIGN_zero    // sign = 0
9665///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9666///
9667/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_sh)
9668#[inline]
9669#[target_feature(enable = "avx512fp16")]
9670#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9671#[rustc_legacy_const_generics(2, 3)]
9672#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9673pub fn _mm_getmant_sh<const NORM: _MM_MANTISSA_NORM_ENUM, const SIGN: _MM_MANTISSA_SIGN_ENUM>(
9674    a: __m128h,
9675    b: __m128h,
9676) -> __m128h {
9677    static_assert_uimm_bits!(NORM, 4);
9678    static_assert_uimm_bits!(SIGN, 2);
9679    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9680}
9681
9682/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9683/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9684/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9685/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9686/// the source sign.
9687///
9688/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9689///
9690///     _MM_MANT_NORM_1_2     // interval [1, 2)
9691///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9692///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9693///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9694///
9695/// The sign is determined by sc which can take the following values:
9696///
9697///     _MM_MANT_SIGN_src     // sign = sign(src)
9698///     _MM_MANT_SIGN_zero    // sign = 0
9699///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9700///
9701/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_sh)
9702#[inline]
9703#[target_feature(enable = "avx512fp16")]
9704#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9705#[rustc_legacy_const_generics(4, 5)]
9706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9707pub fn _mm_mask_getmant_sh<
9708    const NORM: _MM_MANTISSA_NORM_ENUM,
9709    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9710>(
9711    src: __m128h,
9712    k: __mmask8,
9713    a: __m128h,
9714    b: __m128h,
9715) -> __m128h {
9716    static_assert_uimm_bits!(NORM, 4);
9717    static_assert_uimm_bits!(SIGN, 2);
9718    _mm_mask_getmant_round_sh::<NORM, SIGN, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
9719}
9720
9721/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9722/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9723/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9724/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9725/// the source sign.
9726///
9727/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9728///
9729///     _MM_MANT_NORM_1_2     // interval [1, 2)
9730///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9731///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9732///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9733///
9734/// The sign is determined by sc which can take the following values:
9735///
9736///     _MM_MANT_SIGN_src     // sign = sign(src)
9737///     _MM_MANT_SIGN_zero    // sign = 0
9738///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9739///
9740/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_sh)
9741#[inline]
9742#[target_feature(enable = "avx512fp16")]
9743#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0))]
9744#[rustc_legacy_const_generics(3, 4)]
9745#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9746pub fn _mm_maskz_getmant_sh<
9747    const NORM: _MM_MANTISSA_NORM_ENUM,
9748    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9749>(
9750    k: __mmask8,
9751    a: __m128h,
9752    b: __m128h,
9753) -> __m128h {
9754    static_assert_uimm_bits!(NORM, 4);
9755    static_assert_uimm_bits!(SIGN, 2);
9756    _mm_mask_getmant_sh::<NORM, SIGN>(f16x8::ZERO.as_m128h(), k, a, b)
9757}
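
// Scalar-lane sketch (illustrative helper, not part of the module): lane 0 of the result
// is the normalized mantissa of `b`'s lane 0, kept only when bit 0 of `k` is set and
// zeroed otherwise; lanes 1..=7 are always copied from `a`.
#[target_feature(enable = "avx512fp16")]
fn example_getmant_sh_lane0(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    const NORM_1_2: _MM_MANTISSA_NORM_ENUM = 0; // interval [1, 2)
    const SIGN_SRC: _MM_MANTISSA_SIGN_ENUM = 0; // sign = sign(src)
    _mm_maskz_getmant_sh::<NORM_1_2, SIGN_SRC>(k, a, b)
}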
9758
9759/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9760/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
9761/// elements of dst. This intrinsic essentially calculates `±(2^k)*|x.significand|`, where k depends
9762/// on the interval range defined by norm and the sign depends on sign and the source sign. Exceptions can
9763/// be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9764///
9765/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9766///
9767///     _MM_MANT_NORM_1_2     // interval [1, 2)
9768///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9769///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9770///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9771///
9772/// The sign is determined by sc which can take the following values:
9773///
9774///     _MM_MANT_SIGN_src     // sign = sign(src)
9775///     _MM_MANT_SIGN_zero    // sign = 0
9776///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9779///
9780/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_round_sh)
9781#[inline]
9782#[target_feature(enable = "avx512fp16")]
9783#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9784#[rustc_legacy_const_generics(2, 3, 4)]
9785#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9786pub fn _mm_getmant_round_sh<
9787    const NORM: _MM_MANTISSA_NORM_ENUM,
9788    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9789    const SAE: i32,
9790>(
9791    a: __m128h,
9792    b: __m128h,
9793) -> __m128h {
9794    static_assert_uimm_bits!(NORM, 4);
9795    static_assert_uimm_bits!(SIGN, 2);
9796    static_assert_sae!(SAE);
9797    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
9798}
9799
9800/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9801/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
9802/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9803/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9804/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9805///
9806/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9807///
9808///     _MM_MANT_NORM_1_2     // interval [1, 2)
9809///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9810///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9811///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9812///
9813/// The sign is determined by sc which can take the following values:
9814///
9815///     _MM_MANT_SIGN_src     // sign = sign(src)
9816///     _MM_MANT_SIGN_zero    // sign = 0
9817///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9820///
9821/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_round_sh)
9822#[inline]
9823#[target_feature(enable = "avx512fp16")]
9824#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9825#[rustc_legacy_const_generics(4, 5, 6)]
9826#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9827pub fn _mm_mask_getmant_round_sh<
9828    const NORM: _MM_MANTISSA_NORM_ENUM,
9829    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9830    const SAE: i32,
9831>(
9832    src: __m128h,
9833    k: __mmask8,
9834    a: __m128h,
9835    b: __m128h,
9836) -> __m128h {
9837    unsafe {
9838        static_assert_uimm_bits!(NORM, 4);
9839        static_assert_uimm_bits!(SIGN, 2);
9840        static_assert_sae!(SAE);
9841        vgetmantsh(a, b, (SIGN << 2) | NORM, src, k, SAE)
9842    }
9843}
9844
9845/// Normalize the mantissas of the lower half-precision (16-bit) floating-point element in b, store
9846/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
9847/// and copy the upper 7 packed elements from a to the upper elements of dst. This intrinsic essentially calculates
9848/// `±(2^k)*|x.significand|`, where k depends on the interval range defined by norm and the sign depends on sign and
9849/// the source sign. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
9850///
9851/// The mantissa is normalized to the interval specified by interv, which can take the following values:
9852///
9853///     _MM_MANT_NORM_1_2     // interval [1, 2)
9854///     _MM_MANT_NORM_p5_2    // interval [0.5, 2)
9855///     _MM_MANT_NORM_p5_1    // interval [0.5, 1)
9856///     _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
9857///
9858/// The sign is determined by sc which can take the following values:
9859///
9860///     _MM_MANT_SIGN_src     // sign = sign(src)
9861///     _MM_MANT_SIGN_zero    // sign = 0
9862///     _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
9865///
9866/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_round_sh)
9867#[inline]
9868#[target_feature(enable = "avx512fp16")]
9869#[cfg_attr(test, assert_instr(vgetmantsh, NORM = 0, SIGN = 0, SAE = 8))]
9870#[rustc_legacy_const_generics(3, 4, 5)]
9871#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9872pub fn _mm_maskz_getmant_round_sh<
9873    const NORM: _MM_MANTISSA_NORM_ENUM,
9874    const SIGN: _MM_MANTISSA_SIGN_ENUM,
9875    const SAE: i32,
9876>(
9877    k: __mmask8,
9878    a: __m128h,
9879    b: __m128h,
9880) -> __m128h {
9881    static_assert_uimm_bits!(NORM, 4);
9882    static_assert_uimm_bits!(SIGN, 2);
9883    static_assert_sae!(SAE);
9884    _mm_mask_getmant_round_sh::<NORM, SIGN, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
9885}
9886
9887/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9888/// specified by imm8, and store the results in dst.
9889///
9890/// Rounding is done according to the imm8 parameter, which can be one of:
9891///
9892/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9893/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9894/// * [`_MM_FROUND_TO_POS_INF`] : round up
9895/// * [`_MM_FROUND_TO_ZERO`] : truncate
9896/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9897///
9898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ph)
9899#[inline]
9900#[target_feature(enable = "avx512fp16,avx512vl")]
9901#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9902#[rustc_legacy_const_generics(1)]
9903#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9904pub fn _mm_roundscale_ph<const IMM8: i32>(a: __m128h) -> __m128h {
9905    static_assert_uimm_bits!(IMM8, 8);
9906    _mm_mask_roundscale_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
9907}
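
// Sketch of one imm8 encoding (illustrative helper, not part of the module). For the
// VRNDSCALE family the upper four bits of imm8 give the number of fraction bits to keep
// and the low bits select the rounding mode, so `(2 << 4) | _MM_FROUND_TO_NEAREST_INT`
// rounds every element to the nearest multiple of 0.25.
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_round_to_quarters(a: __m128h) -> __m128h {
    const IMM8: i32 = (2 << 4) | _MM_FROUND_TO_NEAREST_INT;
    _mm_roundscale_ph::<IMM8>(a)
}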
9908
9909/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9910/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9911/// the corresponding mask bit is not set).
9912///
9913/// Rounding is done according to the imm8 parameter, which can be one of:
9914///
9915/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9916/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9917/// * [`_MM_FROUND_TO_POS_INF`] : round up
9918/// * [`_MM_FROUND_TO_ZERO`] : truncate
9919/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9920///
9921/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ph)
9922#[inline]
9923#[target_feature(enable = "avx512fp16,avx512vl")]
9924#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9925#[rustc_legacy_const_generics(3)]
9926#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9927pub fn _mm_mask_roundscale_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
9928    unsafe {
9929        static_assert_uimm_bits!(IMM8, 8);
9930        vrndscaleph_128(a, IMM8, src, k)
9931    }
9932}
9933
9934/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9935/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
9936/// mask bit is not set).
9937///
9938/// Rounding is done according to the imm8 parameter, which can be one of:
9939///
9940/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9941/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9942/// * [`_MM_FROUND_TO_POS_INF`] : round up
9943/// * [`_MM_FROUND_TO_ZERO`] : truncate
9944/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9945///
9946/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ph)
9947#[inline]
9948#[target_feature(enable = "avx512fp16,avx512vl")]
9949#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9950#[rustc_legacy_const_generics(2)]
9951#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9952pub fn _mm_maskz_roundscale_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
9953    static_assert_uimm_bits!(IMM8, 8);
9954    _mm_mask_roundscale_ph::<IMM8>(_mm_setzero_ph(), k, a)
9955}
9956
9957/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9958/// specified by imm8, and store the results in dst.
9959///
9960/// Rounding is done according to the imm8 parameter, which can be one of:
9961///
9962/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9963/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9964/// * [`_MM_FROUND_TO_POS_INF`] : round up
9965/// * [`_MM_FROUND_TO_ZERO`] : truncate
9966/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9967///
9968/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ph)
9969#[inline]
9970#[target_feature(enable = "avx512fp16,avx512vl")]
9971#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9972#[rustc_legacy_const_generics(1)]
9973#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9974pub fn _mm256_roundscale_ph<const IMM8: i32>(a: __m256h) -> __m256h {
9975    static_assert_uimm_bits!(IMM8, 8);
9976    _mm256_mask_roundscale_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
9977}
9978
9979/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
9980/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
9981/// the corresponding mask bit is not set).
9982///
9983/// Rounding is done according to the imm8 parameter, which can be one of:
9984///
9985/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
9986/// * [`_MM_FROUND_TO_NEG_INF`] : round down
9987/// * [`_MM_FROUND_TO_POS_INF`] : round up
9988/// * [`_MM_FROUND_TO_ZERO`] : truncate
9989/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
9990///
9991/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ph)
9992#[inline]
9993#[target_feature(enable = "avx512fp16,avx512vl")]
9994#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
9995#[rustc_legacy_const_generics(3)]
9996#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
9997pub fn _mm256_mask_roundscale_ph<const IMM8: i32>(
9998    src: __m256h,
9999    k: __mmask16,
10000    a: __m256h,
10001) -> __m256h {
10002    unsafe {
10003        static_assert_uimm_bits!(IMM8, 8);
10004        vrndscaleph_256(a, IMM8, src, k)
10005    }
10006}
10007
10008/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10009/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10010/// mask bit is not set).
10011///
10012/// Rounding is done according to the imm8 parameter, which can be one of:
10013///
10014/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10015/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10016/// * [`_MM_FROUND_TO_POS_INF`] : round up
10017/// * [`_MM_FROUND_TO_ZERO`] : truncate
10018/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10019///
10020/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ph)
10021#[inline]
10022#[target_feature(enable = "avx512fp16,avx512vl")]
10023#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10024#[rustc_legacy_const_generics(2)]
10025#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10026pub fn _mm256_maskz_roundscale_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10027    static_assert_uimm_bits!(IMM8, 8);
10028    _mm256_mask_roundscale_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10029}
10030
10031/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10032/// specified by imm8, and store the results in dst.
10033///
10034/// Rounding is done according to the imm8 parameter, which can be one of:
10035///
10036/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10037/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10038/// * [`_MM_FROUND_TO_POS_INF`] : round up
10039/// * [`_MM_FROUND_TO_ZERO`] : truncate
10040/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10041///
10042/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ph)
10043#[inline]
10044#[target_feature(enable = "avx512fp16")]
10045#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10046#[rustc_legacy_const_generics(1)]
10047#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10048pub fn _mm512_roundscale_ph<const IMM8: i32>(a: __m512h) -> __m512h {
10049    static_assert_uimm_bits!(IMM8, 8);
10050    _mm512_mask_roundscale_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
10051}
10052
10053/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10054/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10055/// the corresponding mask bit is not set).
10056///
10057/// Rounding is done according to the imm8 parameter, which can be one of:
10058///
10059/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10060/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10061/// * [`_MM_FROUND_TO_POS_INF`] : round up
10062/// * [`_MM_FROUND_TO_ZERO`] : truncate
10063/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10064///
10065/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ph)
10066#[inline]
10067#[target_feature(enable = "avx512fp16")]
10068#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10069#[rustc_legacy_const_generics(3)]
10070#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10071pub fn _mm512_mask_roundscale_ph<const IMM8: i32>(
10072    src: __m512h,
10073    k: __mmask32,
10074    a: __m512h,
10075) -> __m512h {
10076    static_assert_uimm_bits!(IMM8, 8);
10077    _mm512_mask_roundscale_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
10078}
10079
10080/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10081/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10082/// mask bit is not set).
10083///
10084/// Rounding is done according to the imm8 parameter, which can be one of:
10085///
10086/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10087/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10088/// * [`_MM_FROUND_TO_POS_INF`] : round up
10089/// * [`_MM_FROUND_TO_ZERO`] : truncate
10090/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10091///
10092/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ph)
10093#[inline]
10094#[target_feature(enable = "avx512fp16")]
10095#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0))]
10096#[rustc_legacy_const_generics(2)]
10097#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10098pub fn _mm512_maskz_roundscale_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10099    static_assert_uimm_bits!(IMM8, 8);
10100    _mm512_mask_roundscale_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10101}
10102
10103/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10104/// specified by imm8, and store the results in dst. Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10105/// in the sae parameter.
10106///
10107/// Rounding is done according to the imm8 parameter, which can be one of:
10108///
10109/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10110/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10111/// * [`_MM_FROUND_TO_POS_INF`] : round up
10112/// * [`_MM_FROUND_TO_ZERO`] : truncate
10113/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10114///
10115/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ph)
10116#[inline]
10117#[target_feature(enable = "avx512fp16")]
10118#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10119#[rustc_legacy_const_generics(1, 2)]
10120#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10121pub fn _mm512_roundscale_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10122    static_assert_uimm_bits!(IMM8, 8);
10123    static_assert_sae!(SAE);
10124    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10125}
10126
10127/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10128/// specified by imm8, and store the results in dst using writemask k (elements are copied from src when
10129/// the corresponding mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC
10130/// in the sae parameter.
10131///
10132/// Rounding is done according to the imm8 parameter, which can be one of:
10133///
10134/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10135/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10136/// * [`_MM_FROUND_TO_POS_INF`] : round up
10137/// * [`_MM_FROUND_TO_ZERO`] : truncate
10138/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10139///
10140/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ph)
10141#[inline]
10142#[target_feature(enable = "avx512fp16")]
10143#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10144#[rustc_legacy_const_generics(3, 4)]
10145#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10146pub fn _mm512_mask_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10147    src: __m512h,
10148    k: __mmask32,
10149    a: __m512h,
10150) -> __m512h {
10151    unsafe {
10152        static_assert_uimm_bits!(IMM8, 8);
10153        static_assert_sae!(SAE);
10154        vrndscaleph_512(a, IMM8, src, k, SAE)
10155    }
10156}
10157
10158/// Round packed half-precision (16-bit) floating-point elements in a to the number of fraction bits
10159/// specified by imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
10160/// mask bit is not set). Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10161///
10162/// Rounding is done according to the imm8 parameter, which can be one of:
10163///
10164/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10165/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10166/// * [`_MM_FROUND_TO_POS_INF`] : round up
10167/// * [`_MM_FROUND_TO_ZERO`] : truncate
10168/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10169///
10170/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ph)
10171#[inline]
10172#[target_feature(enable = "avx512fp16")]
10173#[cfg_attr(test, assert_instr(vrndscaleph, IMM8 = 0, SAE = 8))]
10174#[rustc_legacy_const_generics(2, 3)]
10175#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10176pub fn _mm512_maskz_roundscale_round_ph<const IMM8: i32, const SAE: i32>(
10177    k: __mmask32,
10178    a: __m512h,
10179) -> __m512h {
10180    static_assert_uimm_bits!(IMM8, 8);
10181    static_assert_sae!(SAE);
10182    _mm512_mask_roundscale_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10183}
10184
10185/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10186/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10187/// from a to the upper elements of dst.
10188///
10189/// Rounding is done according to the imm8 parameter, which can be one of:
10190///
10191/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10192/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10193/// * [`_MM_FROUND_TO_POS_INF`] : round up
10194/// * [`_MM_FROUND_TO_ZERO`] : truncate
10195/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10196///
10197/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_sh)
10198#[inline]
10199#[target_feature(enable = "avx512fp16")]
10200#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10201#[rustc_legacy_const_generics(2)]
10202#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10203pub fn _mm_roundscale_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10204    static_assert_uimm_bits!(IMM8, 8);
10205    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10206}
10207
10208/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10209/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10210/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10211///
10212/// Rounding is done according to the imm8 parameter, which can be one of:
10213///
10214/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10215/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10216/// * [`_MM_FROUND_TO_POS_INF`] : round up
10217/// * [`_MM_FROUND_TO_ZERO`] : truncate
10218/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10219///
10220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_sh)
10221#[inline]
10222#[target_feature(enable = "avx512fp16")]
10223#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10224#[rustc_legacy_const_generics(4)]
10225#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10226pub fn _mm_mask_roundscale_sh<const IMM8: i32>(
10227    src: __m128h,
10228    k: __mmask8,
10229    a: __m128h,
10230    b: __m128h,
10231) -> __m128h {
10232    static_assert_uimm_bits!(IMM8, 8);
10233    _mm_mask_roundscale_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10234}
10235
10236/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10237/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10238/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10239///
10240/// Rounding is done according to the imm8 parameter, which can be one of:
10241///
10242/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10243/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10244/// * [`_MM_FROUND_TO_POS_INF`] : round up
10245/// * [`_MM_FROUND_TO_ZERO`] : truncate
10246/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10247///
10248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_sh)
10249#[inline]
10250#[target_feature(enable = "avx512fp16")]
10251#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0))]
10252#[rustc_legacy_const_generics(3)]
10253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10254pub fn _mm_maskz_roundscale_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10255    static_assert_uimm_bits!(IMM8, 8);
10256    _mm_mask_roundscale_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
10257}
10258
10259/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10260/// specified by imm8, store the result in the lower element of dst, and copy the upper 7 packed elements
10261/// from a to the upper elements of dst.
10262///
10263/// Rounding is done according to the imm8 parameter, which can be one of:
10264///
10265/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10266/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10267/// * [`_MM_FROUND_TO_POS_INF`] : round up
10268/// * [`_MM_FROUND_TO_ZERO`] : truncate
10269/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10270///
10271/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10272///
10273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_round_sh)
10274#[inline]
10275#[target_feature(enable = "avx512fp16")]
10276#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10277#[rustc_legacy_const_generics(2, 3)]
10278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10279pub fn _mm_roundscale_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
10280    static_assert_uimm_bits!(IMM8, 8);
10281    static_assert_sae!(SAE);
10282    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10283}
10284
10285/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10286/// specified by imm8, store the result in the lower element of dst using writemask k (the element is copied
10287/// from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10288///
10289/// Rounding is done according to the imm8 parameter, which can be one of:
10290///
10291/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10292/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10293/// * [`_MM_FROUND_TO_POS_INF`] : round up
10294/// * [`_MM_FROUND_TO_ZERO`] : truncate
10295/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10296///
10297/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10298///
10299/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_round_sh)
10300#[inline]
10301#[target_feature(enable = "avx512fp16")]
10302#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10303#[rustc_legacy_const_generics(4, 5)]
10304#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10305pub fn _mm_mask_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10306    src: __m128h,
10307    k: __mmask8,
10308    a: __m128h,
10309    b: __m128h,
10310) -> __m128h {
10311    unsafe {
10312        static_assert_uimm_bits!(IMM8, 8);
10313        static_assert_sae!(SAE);
10314        vrndscalesh(a, b, src, k, IMM8, SAE)
10315    }
10316}
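
// Merge-masking sketch for the scalar form (illustrative helper, not part of the module):
// lane 0 holds `b`'s lane 0 rounded to an integer when bit 0 of `k` is set and `src`'s
// lane 0 otherwise; lanes 1..=7 always come from `a`. IMM8 = 0 keeps zero fraction bits
// with round-to-nearest, and `_MM_FROUND_NO_EXC` suppresses exceptions.
#[target_feature(enable = "avx512fp16")]
fn example_roundscale_sh_merge(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
    _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, k, a, b)
}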
10317
10318/// Round the lower half-precision (16-bit) floating-point element in b to the number of fraction bits
10319/// specified by imm8, store the result in the lower element of dst using zeromask k (the element is zeroed
10320/// out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper elements of dst.
10321///
10322/// Rounding is done according to the imm8 parameter, which can be one of:
10323///
10324/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10325/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10326/// * [`_MM_FROUND_TO_POS_INF`] : round up
10327/// * [`_MM_FROUND_TO_ZERO`] : truncate
10328/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10329///
10330/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10331///
10332/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_round_sh)
10333#[inline]
10334#[target_feature(enable = "avx512fp16")]
10335#[cfg_attr(test, assert_instr(vrndscalesh, IMM8 = 0, SAE = 8))]
10336#[rustc_legacy_const_generics(3, 4)]
10337#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10338pub fn _mm_maskz_roundscale_round_sh<const IMM8: i32, const SAE: i32>(
10339    k: __mmask8,
10340    a: __m128h,
10341    b: __m128h,
10342) -> __m128h {
10343    static_assert_uimm_bits!(IMM8, 8);
10344    static_assert_sae!(SAE);
10345    _mm_mask_roundscale_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
10346}
10347
10348/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10349/// the results in dst.
10350///
10351/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ph)
10352#[inline]
10353#[target_feature(enable = "avx512fp16,avx512vl")]
10354#[cfg_attr(test, assert_instr(vscalefph))]
10355#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10356pub fn _mm_scalef_ph(a: __m128h, b: __m128h) -> __m128h {
10357    _mm_mask_scalef_ph(_mm_undefined_ph(), 0xff, a, b)
10358}
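
// Semantics sketch (illustrative helper, not part of the module): VSCALEF computes
// `a * 2^floor(b)` per element, so scaling by a splat of 3.0 multiplies every element
// of `a` by 8.
#[target_feature(enable = "avx512fp16,avx512vl")]
fn example_scale_by_eight(a: __m128h) -> __m128h {
    _mm_scalef_ph(a, _mm_set1_ph(3.0))
}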
10359
10360/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10361/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10362///
10363/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ph)
10364#[inline]
10365#[target_feature(enable = "avx512fp16,avx512vl")]
10366#[cfg_attr(test, assert_instr(vscalefph))]
10367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10368pub fn _mm_mask_scalef_ph(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10369    unsafe { vscalefph_128(a, b, src, k) }
10370}
10371
10372/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10373/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10374///
10375/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ph)
10376#[inline]
10377#[target_feature(enable = "avx512fp16,avx512vl")]
10378#[cfg_attr(test, assert_instr(vscalefph))]
10379#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10380pub fn _mm_maskz_scalef_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10381    _mm_mask_scalef_ph(_mm_setzero_ph(), k, a, b)
10382}
10383
10384/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10385/// the results in dst.
10386///
10387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ph)
10388#[inline]
10389#[target_feature(enable = "avx512fp16,avx512vl")]
10390#[cfg_attr(test, assert_instr(vscalefph))]
10391#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10392pub fn _mm256_scalef_ph(a: __m256h, b: __m256h) -> __m256h {
10393    _mm256_mask_scalef_ph(_mm256_undefined_ph(), 0xffff, a, b)
10394}
10395
10396/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10397/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10398///
10399/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ph)
10400#[inline]
10401#[target_feature(enable = "avx512fp16,avx512vl")]
10402#[cfg_attr(test, assert_instr(vscalefph))]
10403#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10404pub fn _mm256_mask_scalef_ph(src: __m256h, k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10405    unsafe { vscalefph_256(a, b, src, k) }
10406}
10407
10408/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10409/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10410///
10411/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ph)
10412#[inline]
10413#[target_feature(enable = "avx512fp16,avx512vl")]
10414#[cfg_attr(test, assert_instr(vscalefph))]
10415#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10416pub fn _mm256_maskz_scalef_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
10417    _mm256_mask_scalef_ph(_mm256_setzero_ph(), k, a, b)
10418}
10419
10420/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10421/// the results in dst.
10422///
10423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ph)
10424#[inline]
10425#[target_feature(enable = "avx512fp16")]
10426#[cfg_attr(test, assert_instr(vscalefph))]
10427#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10428pub fn _mm512_scalef_ph(a: __m512h, b: __m512h) -> __m512h {
10429    _mm512_mask_scalef_ph(_mm512_undefined_ph(), 0xffffffff, a, b)
10430}
10431
10432/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10433/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10434///
10435/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ph)
10436#[inline]
10437#[target_feature(enable = "avx512fp16")]
10438#[cfg_attr(test, assert_instr(vscalefph))]
10439#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10440pub fn _mm512_mask_scalef_ph(src: __m512h, k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10441    _mm512_mask_scalef_round_ph::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10442}
10443
10444/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10445/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10446///
10447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ph)
10448#[inline]
10449#[target_feature(enable = "avx512fp16")]
10450#[cfg_attr(test, assert_instr(vscalefph))]
10451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10452pub fn _mm512_maskz_scalef_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
10453    _mm512_mask_scalef_ph(_mm512_setzero_ph(), k, a, b)
10454}
10455
10456/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10457/// the results in dst.
10458///
10459/// Rounding is done according to the rounding parameter, which can be one of:
10460///
10461/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10462/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10463/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10464/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10465/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10466///
10467/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ph)
10468#[inline]
10469#[target_feature(enable = "avx512fp16")]
10470#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10471#[rustc_legacy_const_generics(2)]
10472#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10473pub fn _mm512_scalef_round_ph<const ROUNDING: i32>(a: __m512h, b: __m512h) -> __m512h {
10474    static_assert_rounding!(ROUNDING);
10475    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_undefined_ph(), 0xffffffff, a, b)
10476}
10477
10478/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10479/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
10480///
10481/// Rounding is done according to the rounding parameter, which can be one of:
10482///
10483/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10484/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10485/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10486/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10487/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10488///
10489/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ph)
10490#[inline]
10491#[target_feature(enable = "avx512fp16")]
10492#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10493#[rustc_legacy_const_generics(4)]
10494#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10495pub fn _mm512_mask_scalef_round_ph<const ROUNDING: i32>(
10496    src: __m512h,
10497    k: __mmask32,
10498    a: __m512h,
10499    b: __m512h,
10500) -> __m512h {
10501    unsafe {
10502        static_assert_rounding!(ROUNDING);
10503        vscalefph_512(a, b, src, k, ROUNDING)
10504    }
10505}
10506
10507/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, and store
10508/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
10509///
10510/// Rounding is done according to the rounding parameter, which can be one of:
10511///
10512/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10513/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10514/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10515/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10516/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10517///
10518/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ph)
10519#[inline]
10520#[target_feature(enable = "avx512fp16")]
10521#[cfg_attr(test, assert_instr(vscalefph, ROUNDING = 8))]
10522#[rustc_legacy_const_generics(3)]
10523#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10524pub fn _mm512_maskz_scalef_round_ph<const ROUNDING: i32>(
10525    k: __mmask32,
10526    a: __m512h,
10527    b: __m512h,
10528) -> __m512h {
10529    static_assert_rounding!(ROUNDING);
10530    _mm512_mask_scalef_round_ph::<ROUNDING>(_mm512_setzero_ph(), k, a, b)
10531}
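
// Rounding-immediate sketch (illustrative helper, not part of the module): the `_round`
// form takes either `_MM_FROUND_CUR_DIRECTION` or a directed mode OR-ed with
// `_MM_FROUND_NO_EXC`, which fixes the rounding direction regardless of `MXCSR.RC`.
#[target_feature(enable = "avx512fp16")]
fn example_scalef_toward_zero(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
    _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(k, a, b)
}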
10532
10533/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10534/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10535/// elements of dst.
10536///
10537/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_sh)
10538#[inline]
10539#[target_feature(enable = "avx512fp16")]
10540#[cfg_attr(test, assert_instr(vscalefsh))]
10541#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10542pub fn _mm_scalef_sh(a: __m128h, b: __m128h) -> __m128h {
10543    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
10544}
10545
10546/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10547/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10548/// and copy the upper 7 packed elements from a to the upper elements of dst.
10549///
10550/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_sh)
10551#[inline]
10552#[target_feature(enable = "avx512fp16")]
10553#[cfg_attr(test, assert_instr(vscalefsh))]
10554#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10555pub fn _mm_mask_scalef_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10556    _mm_mask_scalef_round_sh::<_MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10557}
10558
10559/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10560/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10561/// and copy the upper 7 packed elements from a to the upper elements of dst.
10562///
10563/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_sh)
10564#[inline]
10565#[target_feature(enable = "avx512fp16")]
10566#[cfg_attr(test, assert_instr(vscalefsh))]
10567#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10568pub fn _mm_maskz_scalef_sh(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
10569    _mm_mask_scalef_sh(f16x8::ZERO.as_m128h(), k, a, b)
10570}
10571
10572/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10573/// the result in the lower element of dst, and copy the upper 7 packed elements from a to the upper
10574/// elements of dst.
10575///
10576/// Rounding is done according to the rounding parameter, which can be one of:
10577///
10578/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10579/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10580/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10581/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10582/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10583///
10584/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_round_sh)
10585#[inline]
10586#[target_feature(enable = "avx512fp16")]
10587#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10588#[rustc_legacy_const_generics(2)]
10589#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10590pub fn _mm_scalef_round_sh<const ROUNDING: i32>(a: __m128h, b: __m128h) -> __m128h {
10591    static_assert_rounding!(ROUNDING);
10592    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10593}
10594
10595/// Scale the lower half-precision (16-bit) floating-point element in a using the lower element of b, store
10596/// the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set),
10597/// and copy the upper 7 packed elements from a to the upper elements of dst.
10598///
10599/// Rounding is done according to the rounding parameter, which can be one of:
10600///
10601/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10602/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10603/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10604/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10605/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10606///
10607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_round_sh)
10608#[inline]
10609#[target_feature(enable = "avx512fp16")]
10610#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10611#[rustc_legacy_const_generics(4)]
10612#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10613pub fn _mm_mask_scalef_round_sh<const ROUNDING: i32>(
10614    src: __m128h,
10615    k: __mmask8,
10616    a: __m128h,
10617    b: __m128h,
10618) -> __m128h {
10619    unsafe {
10620        static_assert_rounding!(ROUNDING);
10621        vscalefsh(a, b, src, k, ROUNDING)
10622    }
10623}
10624
10625/// Scale the packed half-precision (16-bit) floating-point elements in a using values from b, store
10626/// the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set),
10627/// and copy the upper 7 packed elements from a to the upper elements of dst.
10628///
10629/// Rounding is done according to the rounding parameter, which can be one of:
10630///
10631/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
10632/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
10633/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
10634/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
10635/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10636///
10637/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_round_sh)
10638#[inline]
10639#[target_feature(enable = "avx512fp16")]
10640#[cfg_attr(test, assert_instr(vscalefsh, ROUNDING = 8))]
10641#[rustc_legacy_const_generics(3)]
10642#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10643pub fn _mm_maskz_scalef_round_sh<const ROUNDING: i32>(
10644    k: __mmask8,
10645    a: __m128h,
10646    b: __m128h,
10647) -> __m128h {
10648    static_assert_rounding!(ROUNDING);
10649    _mm_mask_scalef_round_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
10650}
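
// Illustrative usage sketch (not part of the library): the `_round` variants
// take the rounding mode as a const generic instead of reading `MXCSR.RC`.
//
//     let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
//
// This computes the same low-lane result as `_mm_scalef_sh(a, b)` but with a
// compile-time rounding mode and floating-point exceptions suppressed.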
10651
10652/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10653/// number of bits specified by imm8, and store the results in dst.
10654///
10655/// Rounding is done according to the imm8 parameter, which can be one of:
10656///
10657/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10658/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10659/// * [`_MM_FROUND_TO_POS_INF`] : round up
10660/// * [`_MM_FROUND_TO_ZERO`] : truncate
10661/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10662///
10663/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_ph)
10664#[inline]
10665#[target_feature(enable = "avx512fp16,avx512vl")]
10666#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10667#[rustc_legacy_const_generics(1)]
10668#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10669pub fn _mm_reduce_ph<const IMM8: i32>(a: __m128h) -> __m128h {
10670    static_assert_uimm_bits!(IMM8, 8);
10671    _mm_mask_reduce_ph::<IMM8>(_mm_undefined_ph(), 0xff, a)
10672}
10673
10674/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10675/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10676/// from src when the corresponding mask bit is not set).
10677///
10678/// Rounding is done according to the imm8 parameter, which can be one of:
10679///
10680/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10681/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10682/// * [`_MM_FROUND_TO_POS_INF`] : round up
10683/// * [`_MM_FROUND_TO_ZERO`] : truncate
10684/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10685///
10686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_ph)
10687#[inline]
10688#[target_feature(enable = "avx512fp16,avx512vl")]
10689#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10690#[rustc_legacy_const_generics(3)]
10691#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10692pub fn _mm_mask_reduce_ph<const IMM8: i32>(src: __m128h, k: __mmask8, a: __m128h) -> __m128h {
10693    unsafe {
10694        static_assert_uimm_bits!(IMM8, 8);
10695        vreduceph_128(a, IMM8, src, k)
10696    }
10697}
10698
10699/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10700/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10701/// out when the corresponding mask bit is not set).
10702///
10703/// Rounding is done according to the imm8 parameter, which can be one of:
10704///
10705/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10706/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10707/// * [`_MM_FROUND_TO_POS_INF`] : round up
10708/// * [`_MM_FROUND_TO_ZERO`] : truncate
10709/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10710///
10711/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_ph)
10712#[inline]
10713#[target_feature(enable = "avx512fp16,avx512vl")]
10714#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10715#[rustc_legacy_const_generics(2)]
10716#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10717pub fn _mm_maskz_reduce_ph<const IMM8: i32>(k: __mmask8, a: __m128h) -> __m128h {
10718    static_assert_uimm_bits!(IMM8, 8);
10719    _mm_mask_reduce_ph::<IMM8>(_mm_setzero_ph(), k, a)
10720}
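
// Illustrative usage sketch (not part of the library): `vreduceph` computes
// the "reduced argument" `a - round(a)`, where `IMM8[7:4]` selects how many
// fraction bits the rounding step keeps and the low bits select its rounding
// mode (the `_MM_FROUND_*` values listed above). With `IMM8 = 0` (round to
// the nearest integer, keep no fraction bits):
//
//     let a = _mm_set1_ph(1.25);
//     let r = _mm_reduce_ph::<0>(a);   // every lane is 1.25 - 1.0 = 0.25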
10721
10722/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10723/// number of bits specified by imm8, and store the results in dst.
10724///
10725/// Rounding is done according to the imm8 parameter, which can be one of:
10726///
10727/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10728/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10729/// * [`_MM_FROUND_TO_POS_INF`] : round up
10730/// * [`_MM_FROUND_TO_ZERO`] : truncate
10731/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10732///
10733/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_ph)
10734#[inline]
10735#[target_feature(enable = "avx512fp16,avx512vl")]
10736#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10737#[rustc_legacy_const_generics(1)]
10738#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10739pub fn _mm256_reduce_ph<const IMM8: i32>(a: __m256h) -> __m256h {
10740    static_assert_uimm_bits!(IMM8, 8);
10741    _mm256_mask_reduce_ph::<IMM8>(_mm256_undefined_ph(), 0xffff, a)
10742}
10743
10744/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10745/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10746/// from src when the corresponding mask bit is not set).
10747///
10748/// Rounding is done according to the imm8 parameter, which can be one of:
10749///
10750/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10751/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10752/// * [`_MM_FROUND_TO_POS_INF`] : round up
10753/// * [`_MM_FROUND_TO_ZERO`] : truncate
10754/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10755///
10756/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_reduce_ph)
10757#[inline]
10758#[target_feature(enable = "avx512fp16,avx512vl")]
10759#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10760#[rustc_legacy_const_generics(3)]
10761#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10762pub fn _mm256_mask_reduce_ph<const IMM8: i32>(src: __m256h, k: __mmask16, a: __m256h) -> __m256h {
10763    unsafe {
10764        static_assert_uimm_bits!(IMM8, 8);
10765        vreduceph_256(a, IMM8, src, k)
10766    }
10767}
10768
10769/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10770/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10771/// out when the corresponding mask bit is not set).
10772///
10773/// Rounding is done according to the imm8 parameter, which can be one of:
10774///
10775/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10776/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10777/// * [`_MM_FROUND_TO_POS_INF`] : round up
10778/// * [`_MM_FROUND_TO_ZERO`] : truncate
10779/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10780///
10781/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_reduce_ph)
10782#[inline]
10783#[target_feature(enable = "avx512fp16,avx512vl")]
10784#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10785#[rustc_legacy_const_generics(2)]
10786#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10787pub fn _mm256_maskz_reduce_ph<const IMM8: i32>(k: __mmask16, a: __m256h) -> __m256h {
10788    static_assert_uimm_bits!(IMM8, 8);
10789    _mm256_mask_reduce_ph::<IMM8>(_mm256_setzero_ph(), k, a)
10790}
10791
10792/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10793/// number of bits specified by imm8, and store the results in dst.
10794///
10795/// Rounding is done according to the imm8 parameter, which can be one of:
10796///
10797/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10798/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10799/// * [`_MM_FROUND_TO_POS_INF`] : round up
10800/// * [`_MM_FROUND_TO_ZERO`] : truncate
10801/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10802///
10803/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_ph)
10804#[inline]
10805#[target_feature(enable = "avx512fp16")]
10806#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10807#[rustc_legacy_const_generics(1)]
10808#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10809pub fn _mm512_reduce_ph<const IMM8: i32>(a: __m512h) -> __m512h {
10810    static_assert_uimm_bits!(IMM8, 8);
10811    _mm512_mask_reduce_ph::<IMM8>(_mm512_undefined_ph(), 0xffffffff, a)
10812}
10813
10814/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10815/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10816/// from src when the corresponding mask bit is not set).
10817///
10818/// Rounding is done according to the imm8 parameter, which can be one of:
10819///
10820/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10821/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10822/// * [`_MM_FROUND_TO_POS_INF`] : round up
10823/// * [`_MM_FROUND_TO_ZERO`] : truncate
10824/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10825///
10826/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_ph)
10827#[inline]
10828#[target_feature(enable = "avx512fp16")]
10829#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10830#[rustc_legacy_const_generics(3)]
10831#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10832pub fn _mm512_mask_reduce_ph<const IMM8: i32>(src: __m512h, k: __mmask32, a: __m512h) -> __m512h {
10833    static_assert_uimm_bits!(IMM8, 8);
10834    _mm512_mask_reduce_round_ph::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a)
10835}
10836
10837/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10838/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10839/// out when the corresponding mask bit is not set).
10840///
10841/// Rounding is done according to the imm8 parameter, which can be one of:
10842///
10843/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10844/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10845/// * [`_MM_FROUND_TO_POS_INF`] : round up
10846/// * [`_MM_FROUND_TO_ZERO`] : truncate
10847/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10848///
10849/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_ph)
10850#[inline]
10851#[target_feature(enable = "avx512fp16")]
10852#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0))]
10853#[rustc_legacy_const_generics(2)]
10854#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10855pub fn _mm512_maskz_reduce_ph<const IMM8: i32>(k: __mmask32, a: __m512h) -> __m512h {
10856    static_assert_uimm_bits!(IMM8, 8);
10857    _mm512_mask_reduce_ph::<IMM8>(_mm512_setzero_ph(), k, a)
10858}
10859
10860/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10861/// number of bits specified by imm8, and store the results in dst.
10862///
10863/// Rounding is done according to the imm8 parameter, which can be one of:
10864///
10865/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10866/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10867/// * [`_MM_FROUND_TO_POS_INF`] : round up
10868/// * [`_MM_FROUND_TO_ZERO`] : truncate
10869/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10870///
10871/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10872///
10873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_round_ph)
10874#[inline]
10875#[target_feature(enable = "avx512fp16")]
10876#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10877#[rustc_legacy_const_generics(1, 2)]
10878#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10879pub fn _mm512_reduce_round_ph<const IMM8: i32, const SAE: i32>(a: __m512h) -> __m512h {
10880    static_assert_uimm_bits!(IMM8, 8);
10881    static_assert_sae!(SAE);
10882    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_undefined_ph(), 0xffffffff, a)
10883}
10884
10885/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10886/// number of bits specified by imm8, and store the results in dst using writemask k (elements are copied
10887/// from src when the corresponding mask bit is not set).
10888///
10889/// Rounding is done according to the imm8 parameter, which can be one of:
10890///
10891/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10892/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10893/// * [`_MM_FROUND_TO_POS_INF`] : round up
10894/// * [`_MM_FROUND_TO_ZERO`] : truncate
10895/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10896///
10897/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10898///
10899/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_round_ph)
10900#[inline]
10901#[target_feature(enable = "avx512fp16")]
10902#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10903#[rustc_legacy_const_generics(3, 4)]
10904#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10905pub fn _mm512_mask_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10906    src: __m512h,
10907    k: __mmask32,
10908    a: __m512h,
10909) -> __m512h {
10910    unsafe {
10911        static_assert_uimm_bits!(IMM8, 8);
10912        static_assert_sae!(SAE);
10913        vreduceph_512(a, IMM8, src, k, SAE)
10914    }
10915}
10916
10917/// Extract the reduced argument of packed half-precision (16-bit) floating-point elements in a by the
10918/// number of bits specified by imm8, and store the results in dst using zeromask k (elements are zeroed
10919/// out when the corresponding mask bit is not set).
10920///
10921/// Rounding is done according to the imm8 parameter, which can be one of:
10922///
10923/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10924/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10925/// * [`_MM_FROUND_TO_POS_INF`] : round up
10926/// * [`_MM_FROUND_TO_ZERO`] : truncate
10927/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10928///
10929/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
10930///
10931/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_reduce_round_ph)
10932#[inline]
10933#[target_feature(enable = "avx512fp16")]
10934#[cfg_attr(test, assert_instr(vreduceph, IMM8 = 0, SAE = 8))]
10935#[rustc_legacy_const_generics(2, 3)]
10936#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10937pub fn _mm512_maskz_reduce_round_ph<const IMM8: i32, const SAE: i32>(
10938    k: __mmask32,
10939    a: __m512h,
10940) -> __m512h {
10941    static_assert_uimm_bits!(IMM8, 8);
10942    static_assert_sae!(SAE);
10943    _mm512_mask_reduce_round_ph::<IMM8, SAE>(_mm512_setzero_ph(), k, a)
10944}
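
// Illustrative usage sketch (not part of the library): for the 512-bit
// `_round` variants, `SAE` must be `_MM_FROUND_CUR_DIRECTION` or
// `_MM_FROUND_NO_EXC`; the reduction itself is still controlled by `IMM8`.
//
//     let r = _mm512_reduce_round_ph::<0, { _MM_FROUND_NO_EXC }>(a);
//
// This computes the same reduced arguments as `_mm512_reduce_ph::<0>(a)`
// while suppressing floating-point exceptions.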
10945
10946/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10947/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the
10948/// upper 7 packed elements from a to the upper elements of dst.
10949///
10950/// Rounding is done according to the imm8 parameter, which can be one of:
10951///
10952/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10953/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10954/// * [`_MM_FROUND_TO_POS_INF`] : round up
10955/// * [`_MM_FROUND_TO_ZERO`] : truncate
10956/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10957///
10958/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_sh)
10959#[inline]
10960#[target_feature(enable = "avx512fp16")]
10961#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10962#[rustc_legacy_const_generics(2)]
10963#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10964pub fn _mm_reduce_sh<const IMM8: i32>(a: __m128h, b: __m128h) -> __m128h {
10965    static_assert_uimm_bits!(IMM8, 8);
10966    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), 0xff, a, b)
10967}
10968
10969/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10970/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
10971/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from
10972/// a to the upper elements of dst.
10973///
10974/// Rounding is done according to the imm8 parameter, which can be one of:
10975///
10976/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
10977/// * [`_MM_FROUND_TO_NEG_INF`] : round down
10978/// * [`_MM_FROUND_TO_POS_INF`] : round up
10979/// * [`_MM_FROUND_TO_ZERO`] : truncate
10980/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
10981///
10982/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_sh)
10983#[inline]
10984#[target_feature(enable = "avx512fp16")]
10985#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
10986#[rustc_legacy_const_generics(4)]
10987#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
10988pub fn _mm_mask_reduce_sh<const IMM8: i32>(
10989    src: __m128h,
10990    k: __mmask8,
10991    a: __m128h,
10992    b: __m128h,
10993) -> __m128h {
10994    static_assert_uimm_bits!(IMM8, 8);
10995    _mm_mask_reduce_round_sh::<IMM8, _MM_FROUND_CUR_DIRECTION>(src, k, a, b)
10996}
10997
10998/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
10999/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
11000/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
11001/// to the upper elements of dst.
11002///
11003/// Rounding is done according to the imm8 parameter, which can be one of:
11004///
11005/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11006/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11007/// * [`_MM_FROUND_TO_POS_INF`] : round up
11008/// * [`_MM_FROUND_TO_ZERO`] : truncate
11009/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11010///
11011/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_sh)
11012#[inline]
11013#[target_feature(enable = "avx512fp16")]
11014#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0))]
11015#[rustc_legacy_const_generics(3)]
11016#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11017pub fn _mm_maskz_reduce_sh<const IMM8: i32>(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
11018    static_assert_uimm_bits!(IMM8, 8);
11019    _mm_mask_reduce_sh::<IMM8>(f16x8::ZERO.as_m128h(), k, a, b)
11020}
11021
11022/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
11023/// the number of bits specified by imm8, store the result in the lower element of dst, and copy the upper
11024/// 7 packed elements from a to the upper elements of dst.
11025///
11026/// Rounding is done according to the imm8 parameter, which can be one of:
11027///
11028/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11029/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11030/// * [`_MM_FROUND_TO_POS_INF`] : round up
11031/// * [`_MM_FROUND_TO_ZERO`] : truncate
11032/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11033///
11034/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11035///
11036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_round_sh)
11037#[inline]
11038#[target_feature(enable = "avx512fp16")]
11039#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
11040#[rustc_legacy_const_generics(2, 3)]
11041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11042pub fn _mm_reduce_round_sh<const IMM8: i32, const SAE: i32>(a: __m128h, b: __m128h) -> __m128h {
11043    static_assert_uimm_bits!(IMM8, 8);
11044    static_assert_sae!(SAE);
11045    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), 0xff, a, b)
11046}
11047
11048/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
11049/// the number of bits specified by imm8, store the result in the lower element of dst using writemask k
11050/// (the element is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a
11051/// to the upper elements of dst.
11052///
11053/// Rounding is done according to the imm8 parameter, which can be one of:
11054///
11055/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11056/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11057/// * [`_MM_FROUND_TO_POS_INF`] : round up
11058/// * [`_MM_FROUND_TO_ZERO`] : truncate
11059/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11060///
11061/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11062///
11063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_reduce_round_sh)
11064#[inline]
11065#[target_feature(enable = "avx512fp16")]
11066#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
11067#[rustc_legacy_const_generics(4, 5)]
11068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11069pub fn _mm_mask_reduce_round_sh<const IMM8: i32, const SAE: i32>(
11070    src: __m128h,
11071    k: __mmask8,
11072    a: __m128h,
11073    b: __m128h,
11074) -> __m128h {
11075    unsafe {
11076        static_assert_uimm_bits!(IMM8, 8);
11077        static_assert_sae!(SAE);
11078        vreducesh(a, b, src, k, IMM8, SAE)
11079    }
11080}
11081
11082/// Extract the reduced argument of the lower half-precision (16-bit) floating-point element in b by
11083/// the number of bits specified by imm8, store the result in the lower element of dst using zeromask k
11084/// (the element is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a
11085/// to the upper elements of dst.
11086///
11087/// Rounding is done according to the imm8 parameter, which can be one of:
11088///
11089/// * [`_MM_FROUND_TO_NEAREST_INT`] : round to nearest
11090/// * [`_MM_FROUND_TO_NEG_INF`] : round down
11091/// * [`_MM_FROUND_TO_POS_INF`] : round up
11092/// * [`_MM_FROUND_TO_ZERO`] : truncate
11093/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11094///
11095/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
11096///
11097/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_reduce_round_sh)
11098#[inline]
11099#[target_feature(enable = "avx512fp16")]
11100#[cfg_attr(test, assert_instr(vreducesh, IMM8 = 0, SAE = 8))]
11101#[rustc_legacy_const_generics(3, 4)]
11102#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11103pub fn _mm_maskz_reduce_round_sh<const IMM8: i32, const SAE: i32>(
11104    k: __mmask8,
11105    a: __m128h,
11106    b: __m128h,
11107) -> __m128h {
11108    static_assert_uimm_bits!(IMM8, 8);
11109    static_assert_sae!(SAE);
11110    _mm_mask_reduce_round_sh::<IMM8, SAE>(f16x8::ZERO.as_m128h(), k, a, b)
11111}
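
// Illustrative usage sketch (not part of the library): the scalar variants
// reduce only `b[0]` and pass the upper lanes of `a` through unchanged.
//
//     let a = _mm_set1_ph(9.0);
//     let b = _mm_set_sh(1.75);
//     let r = _mm_reduce_sh::<{ _MM_FROUND_TO_ZERO }>(a, b);
//     // low lane: 1.75 - trunc(1.75) = 0.75; upper 7 lanes copied from a (9.0)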
11112
11113/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11114/// sum of all elements in a.
11115///
11116/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_add_ph)
11117#[inline]
11118#[target_feature(enable = "avx512fp16,avx512vl")]
11119#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11120pub fn _mm_reduce_add_ph(a: __m128h) -> f16 {
11121    unsafe {
11122        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11123        let a = _mm_add_ph(a, b);
11124        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11125        let a = _mm_add_ph(a, b);
11126        simd_extract!(a, 0, f16) + simd_extract!(a, 1, f16)
11127    }
11128}
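
// Illustrative usage sketch (not part of the library): this is a horizontal
// reduction built from shuffles, so the additions happen in a tree order
// rather than strictly left to right.
//
//     let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
//     let sum = _mm_reduce_add_ph(a);   // 36.0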
11129
11130/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11131/// sum of all elements in a.
11132///
11133/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_add_ph)
11134#[inline]
11135#[target_feature(enable = "avx512fp16,avx512vl")]
11136#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11137pub fn _mm256_reduce_add_ph(a: __m256h) -> f16 {
11138    unsafe {
11139        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11140        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11141        _mm_reduce_add_ph(_mm_add_ph(p, q))
11142    }
11143}
11144
11145/// Reduce the packed half-precision (16-bit) floating-point elements in a by addition. Returns the
11146/// sum of all elements in a.
11147///
11148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ph)
11149#[inline]
11150#[target_feature(enable = "avx512fp16")]
11151#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11152pub fn _mm512_reduce_add_ph(a: __m512h) -> f16 {
11153    unsafe {
11154        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11155        let q = simd_shuffle!(
11156            a,
11157            a,
11158            [
11159                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11160            ]
11161        );
11162        _mm256_reduce_add_ph(_mm256_add_ph(p, q))
11163    }
11164}
11165
11166/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11167/// the product of all elements in a.
11168///
11169/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_mul_ph)
11170#[inline]
11171#[target_feature(enable = "avx512fp16,avx512vl")]
11172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11173pub fn _mm_reduce_mul_ph(a: __m128h) -> f16 {
11174    unsafe {
11175        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11176        let a = _mm_mul_ph(a, b);
11177        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11178        let a = _mm_mul_ph(a, b);
11179        simd_extract!(a, 0, f16) * simd_extract!(a, 1, f16)
11180    }
11181}
11182
11183/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11184/// the product of all elements in a.
11185///
11186/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_mul_ph)
11187#[inline]
11188#[target_feature(enable = "avx512fp16,avx512vl")]
11189#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11190pub fn _mm256_reduce_mul_ph(a: __m256h) -> f16 {
11191    unsafe {
11192        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11193        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11194        _mm_reduce_mul_ph(_mm_mul_ph(p, q))
11195    }
11196}
11197
11198/// Reduce the packed half-precision (16-bit) floating-point elements in a by multiplication. Returns
11199/// the product of all elements in a.
11200///
11201/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ph)
11202#[inline]
11203#[target_feature(enable = "avx512fp16")]
11204#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11205pub fn _mm512_reduce_mul_ph(a: __m512h) -> f16 {
11206    unsafe {
11207        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11208        let q = simd_shuffle!(
11209            a,
11210            a,
11211            [
11212                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11213            ]
11214        );
11215        _mm256_reduce_mul_ph(_mm256_mul_ph(p, q))
11216    }
11217}
11218
11219/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11220/// minimum of all elements in a.
11221///
11222/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_min_ph)
11223#[inline]
11224#[target_feature(enable = "avx512fp16,avx512vl")]
11225#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11226pub fn _mm_reduce_min_ph(a: __m128h) -> f16 {
11227    unsafe {
11228        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11229        let a = _mm_min_ph(a, b);
11230        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11231        let a = _mm_min_ph(a, b);
11232        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11233        simd_extract!(_mm_min_sh(a, b), 0)
11234    }
11235}
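
// Illustrative usage sketch (not part of the library): a shuffle-based
// horizontal minimum whose final step goes through `_mm_min_sh`.
//
//     let a = _mm_set_ph(4.0, -1.0, 7.0, 0.5, 3.0, 2.0, 9.0, 6.0);
//     let m = _mm_reduce_min_ph(a);   // -1.0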
11236
11237/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11238/// minimum of all elements in a.
11239///
11240/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_min_ph)
11241#[inline]
11242#[target_feature(enable = "avx512fp16,avx512vl")]
11243#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11244pub fn _mm256_reduce_min_ph(a: __m256h) -> f16 {
11245    unsafe {
11246        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11247        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11248        _mm_reduce_min_ph(_mm_min_ph(p, q))
11249    }
11250}
11251
11252/// Reduce the packed half-precision (16-bit) floating-point elements in a by minimum. Returns the
11253/// minimum of all elements in a.
11254///
11255/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ph)
11256#[inline]
11257#[target_feature(enable = "avx512fp16")]
11258#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11259pub fn _mm512_reduce_min_ph(a: __m512h) -> f16 {
11260    unsafe {
11261        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11262        let q = simd_shuffle!(
11263            a,
11264            a,
11265            [
11266                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11267            ]
11268        );
11269        _mm256_reduce_min_ph(_mm256_min_ph(p, q))
11270    }
11271}
11272
11273/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11274/// maximum of all elements in a.
11275///
11276/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_reduce_max_ph)
11277#[inline]
11278#[target_feature(enable = "avx512fp16,avx512vl")]
11279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11280pub fn _mm_reduce_max_ph(a: __m128h) -> f16 {
11281    unsafe {
11282        let b = simd_shuffle!(a, a, [4, 5, 6, 7, 0, 1, 2, 3]);
11283        let a = _mm_max_ph(a, b);
11284        let b = simd_shuffle!(a, a, [2, 3, 0, 1, 4, 5, 6, 7]);
11285        let a = _mm_max_ph(a, b);
11286        let b = simd_shuffle!(a, a, [1, 0, 2, 3, 4, 5, 6, 7]);
11287        simd_extract!(_mm_max_sh(a, b), 0)
11288    }
11289}
11290
11291/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11292/// maximum of all elements in a.
11293///
11294/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_reduce_max_ph)
11295#[inline]
11296#[target_feature(enable = "avx512fp16,avx512vl")]
11297#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11298pub fn _mm256_reduce_max_ph(a: __m256h) -> f16 {
11299    unsafe {
11300        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
11301        let q = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
11302        _mm_reduce_max_ph(_mm_max_ph(p, q))
11303    }
11304}
11305
11306/// Reduce the packed half-precision (16-bit) floating-point elements in a by maximum. Returns the
11307/// maximum of all elements in a.
11308///
11309/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ph)
11310#[inline]
11311#[target_feature(enable = "avx512fp16")]
11312#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11313pub fn _mm512_reduce_max_ph(a: __m512h) -> f16 {
11314    unsafe {
11315        let p = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]);
11316        let q = simd_shuffle!(
11317            a,
11318            a,
11319            [
11320                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
11321            ]
11322        );
11323        _mm256_reduce_max_ph(_mm256_max_ph(p, q))
11324    }
11325}
11326
11327macro_rules! fpclass_asm { // FIXME: use LLVM intrinsics
11328    ($mask_type: ty, $reg: ident, $a: expr) => {{
11329        let dst: $mask_type;
11330        asm!(
11331            "vfpclassph {k}, {src}, {imm8}",
11332            k = lateout(kreg) dst,
11333            src = in($reg) $a,
11334            imm8 = const IMM8,
11335            options(pure, nomem, nostack)
11336        );
11337        dst
11338    }};
11339    ($mask_type: ty, $mask: expr, $reg: ident, $a: expr) => {{
11340        let dst: $mask_type;
11341        asm!(
11342            "vfpclassph {k} {{ {mask} }}, {src}, {imm8}",
11343            k = lateout(kreg) dst,
11344            mask = in(kreg) $mask,
11345            src = in($reg) $a,
11346            imm8 = const IMM8,
11347            options(pure, nomem, nostack)
11348        );
11349        dst
11350    }};
11351}
11352
11353/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11354/// by imm8, and store the results in mask vector k.
11355/// imm can be a combination of:
11356///
11357///     0x01 // QNaN
11358///     0x02 // Positive Zero
11359///     0x04 // Negative Zero
11360///     0x08 // Positive Infinity
11361///     0x10 // Negative Infinity
11362///     0x20 // Denormal
11363///     0x40 // Negative
11364///     0x80 // SNaN
11365///
11366/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_ph_mask)
11367#[inline]
11368#[target_feature(enable = "avx512fp16,avx512vl")]
11369#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11370#[rustc_legacy_const_generics(1)]
11371#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11372pub fn _mm_fpclass_ph_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11373    unsafe {
11374        static_assert_uimm_bits!(IMM8, 8);
11375        fpclass_asm!(__mmask8, xmm_reg, a)
11376    }
11377}
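
// Illustrative usage sketch (not part of the library; assumes the unstable
// `f16` constants such as `f16::NAN` are available): the category bits can be
// OR-ed together, e.g. `0x01 | 0x80` tests for any NaN and `0x08 | 0x10` for
// any infinity; each set bit in the returned mask marks a matching lane.
//
//     let a = _mm_set_ph(1.0, f16::NAN, f16::INFINITY, -0.0, 0.0, -1.0, 2.0, 3.0);
//     let nan_mask = _mm_fpclass_ph_mask::<{ 0x01 | 0x80 }>(a);
//     // bit 6 of nan_mask is set (the NaN lane)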
11378
11379/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11380/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11381/// corresponding mask bit is not set).
11382/// imm can be a combination of:
11383///
11384///     0x01 // QNaN
11385///     0x02 // Positive Zero
11386///     0x04 // Negative Zero
11387///     0x08 // Positive Infinity
11388///     0x10 // Negative Infinity
11389///     0x20 // Denormal
11390///     0x40 // Negative
11391///     0x80 // SNaN
11392///
11393/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_ph_mask)
11394#[inline]
11395#[target_feature(enable = "avx512fp16,avx512vl")]
11396#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11397#[rustc_legacy_const_generics(2)]
11398#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11399pub fn _mm_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11400    unsafe {
11401        static_assert_uimm_bits!(IMM8, 8);
11402        fpclass_asm!(__mmask8, k1, xmm_reg, a)
11403    }
11404}
11405
11406/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11407/// by imm8, and store the results in mask vector k.
11408/// imm can be a combination of:
11409///
11410///     0x01 // QNaN
11411///     0x02 // Positive Zero
11412///     0x04 // Negative Zero
11413///     0x08 // Positive Infinity
11414///     0x10 // Negative Infinity
11415///     0x20 // Denormal
11416///     0x40 // Negative
11417///     0x80 // SNaN
11418///
11419/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fpclass_ph_mask)
11420#[inline]
11421#[target_feature(enable = "avx512fp16,avx512vl")]
11422#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11423#[rustc_legacy_const_generics(1)]
11424#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11425pub fn _mm256_fpclass_ph_mask<const IMM8: i32>(a: __m256h) -> __mmask16 {
11426    unsafe {
11427        static_assert_uimm_bits!(IMM8, 8);
11428        fpclass_asm!(__mmask16, ymm_reg, a)
11429    }
11430}
11431
11432/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11433/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11434/// corresponding mask bit is not set).
11435/// imm can be a combination of:
11436///
11437///     0x01 // QNaN
11438///     0x02 // Positive Zero
11439///     0x04 // Negative Zero
11440///     0x08 // Positive Infinity
11441///     0x10 // Negative Infinity
11442///     0x20 // Denormal
11443///     0x40 // Negative
11444///     0x80 // SNaN
11445///
11446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fpclass_ph_mask)
11447#[inline]
11448#[target_feature(enable = "avx512fp16,avx512vl")]
11449#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11450#[rustc_legacy_const_generics(2)]
11451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11452pub fn _mm256_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask16, a: __m256h) -> __mmask16 {
11453    unsafe {
11454        static_assert_uimm_bits!(IMM8, 8);
11455        fpclass_asm!(__mmask16, k1, ymm_reg, a)
11456    }
11457}
11458
11459/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11460/// by imm8, and store the results in mask vector k.
11461/// imm can be a combination of:
11462///
11463///     0x01 // QNaN
11464///     0x02 // Positive Zero
11465///     0x04 // Negative Zero
11466///     0x08 // Positive Infinity
11467///     0x10 // Negative Infinity
11468///     0x20 // Denormal
11469///     0x40 // Negative
11470///     0x80 // SNaN
11471///
11472/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fpclass_ph_mask)
11473#[inline]
11474#[target_feature(enable = "avx512fp16")]
11475#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11476#[rustc_legacy_const_generics(1)]
11477#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11478pub fn _mm512_fpclass_ph_mask<const IMM8: i32>(a: __m512h) -> __mmask32 {
11479    unsafe {
11480        static_assert_uimm_bits!(IMM8, 8);
11481        fpclass_asm!(__mmask32, zmm_reg, a)
11482    }
11483}
11484
11485/// Test packed half-precision (16-bit) floating-point elements in a for special categories specified
11486/// by imm8, and store the results in mask vector k using zeromask k (elements are zeroed out when the
11487/// corresponding mask bit is not set).
11488/// imm can be a combination of:
11489///
11490///     0x01 // QNaN
11491///     0x02 // Positive Zero
11492///     0x04 // Negative Zero
11493///     0x08 // Positive Infinity
11494///     0x10 // Negative Infinity
11495///     0x20 // Denormal
11496///     0x40 // Negative
11497///     0x80 // SNaN
11498///
11499/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fpclass_ph_mask)
11500#[inline]
11501#[target_feature(enable = "avx512fp16")]
11502#[cfg_attr(test, assert_instr(vfpclassph, IMM8 = 0))]
11503#[rustc_legacy_const_generics(2)]
11504#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11505pub fn _mm512_mask_fpclass_ph_mask<const IMM8: i32>(k1: __mmask32, a: __m512h) -> __mmask32 {
11506    unsafe {
11507        static_assert_uimm_bits!(IMM8, 8);
11508        fpclass_asm!(__mmask32, k1, zmm_reg, a)
11509    }
11510}
11511
11512/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11513/// by imm8, and store the result in mask vector k.
11514/// imm can be a combination of:
11515///
11516///     0x01 // QNaN
11517///     0x02 // Positive Zero
11518///     0x04 // Negative Zero
11519///     0x08 // Positive Infinity
11520///     0x10 // Negative Infinity
11521///     0x20 // Denormal
11522///     0x40 // Negative
11523///     0x80 // SNaN
11524///
11525/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fpclass_sh_mask)
11526#[inline]
11527#[target_feature(enable = "avx512fp16")]
11528#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11529#[rustc_legacy_const_generics(1)]
11530#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11531pub fn _mm_fpclass_sh_mask<const IMM8: i32>(a: __m128h) -> __mmask8 {
11532    _mm_mask_fpclass_sh_mask::<IMM8>(0xff, a)
11533}
11534
11535/// Test the lower half-precision (16-bit) floating-point element in a for special categories specified
11536/// by imm8, and store the result in mask vector k using zeromask k (elements are zeroed out when the
11537/// corresponding mask bit is not set).
11538/// imm can be a combination of:
11539///
11540///     0x01 // QNaN
11541///     0x02 // Positive Zero
11542///     0x04 // Negative Zero
11543///     0x08 // Positive Infinity
11544///     0x10 // Negative Infinity
11545///     0x20 // Denormal
11546///     0x40 // Negative
11547///     0x80 // SNaN
11548///
11549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fpclass_sh_mask)
11550#[inline]
11551#[target_feature(enable = "avx512fp16")]
11552#[cfg_attr(test, assert_instr(vfpclasssh, IMM8 = 0))]
11553#[rustc_legacy_const_generics(2)]
11554#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11555pub fn _mm_mask_fpclass_sh_mask<const IMM8: i32>(k1: __mmask8, a: __m128h) -> __mmask8 {
11556    unsafe {
11557        static_assert_uimm_bits!(IMM8, 8);
11558        vfpclasssh(a, IMM8, k1)
11559    }
11560}
11561
11562/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11563/// and store the results in dst.
11564///
11565/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ph)
11566#[inline]
11567#[target_feature(enable = "avx512fp16,avx512vl")]
11568#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11569pub fn _mm_mask_blend_ph(k: __mmask8, a: __m128h, b: __m128h) -> __m128h {
11570    unsafe { simd_select_bitmask(k, b, a) }
11571}
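
// Illustrative usage sketch (not part of the library): each mask bit selects
// between the two sources per lane; a clear bit keeps the lane from `a`, a
// set bit takes it from `b`.
//
//     let a = _mm_set1_ph(1.0);
//     let b = _mm_set1_ph(2.0);
//     let r = _mm_mask_blend_ph(0b0000_1111, a, b);
//     // lanes 0..=3 come from b (2.0), lanes 4..=7 from a (1.0)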
11572
11573/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11574/// and store the results in dst.
11575///
11576/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ph)
11577#[inline]
11578#[target_feature(enable = "avx512fp16,avx512vl")]
11579#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11580pub fn _mm256_mask_blend_ph(k: __mmask16, a: __m256h, b: __m256h) -> __m256h {
11581    unsafe { simd_select_bitmask(k, b, a) }
11582}
11583
11584/// Blend packed half-precision (16-bit) floating-point elements from a and b using control mask k,
11585/// and store the results in dst.
11586///
11587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ph)
11588#[inline]
11589#[target_feature(enable = "avx512fp16")]
11590#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11591pub fn _mm512_mask_blend_ph(k: __mmask32, a: __m512h, b: __m512h) -> __m512h {
11592    unsafe { simd_select_bitmask(k, b, a) }
11593}
11594
11595/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11596/// and index in idx, and store the results in dst.
11597///
11598/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ph)
11599#[inline]
11600#[target_feature(enable = "avx512fp16,avx512vl")]
11601#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11602pub fn _mm_permutex2var_ph(a: __m128h, idx: __m128i, b: __m128h) -> __m128h {
11603    _mm_castsi128_ph(_mm_permutex2var_epi16(
11604        _mm_castph_si128(a),
11605        idx,
11606        _mm_castph_si128(b),
11607    ))
11608}
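
// Illustrative usage sketch (not part of the library): for the 128-bit form
// each 16-bit index uses bits [2:0] to pick an element and bit 3 to pick the
// source (`0` = `a`, `1` = `b`), so index `0b1000` selects element 0 of `b`.
//
//     let a = _mm_set1_ph(1.0);
//     let b = _mm_set1_ph(2.0);
//     let idx = _mm_set_epi16(8, 0, 8, 0, 8, 0, 8, 0);  // alternate b[0], a[0]
//     let r = _mm_permutex2var_ph(a, idx, b);           // 2.0 and 1.0 interleaved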
11609
11610/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11611/// and index in idx, and store the results in dst.
11612///
11613/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ph)
11614#[inline]
11615#[target_feature(enable = "avx512fp16,avx512vl")]
11616#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11617pub fn _mm256_permutex2var_ph(a: __m256h, idx: __m256i, b: __m256h) -> __m256h {
11618    _mm256_castsi256_ph(_mm256_permutex2var_epi16(
11619        _mm256_castph_si256(a),
11620        idx,
11621        _mm256_castph_si256(b),
11622    ))
11623}
11624
11625/// Shuffle half-precision (16-bit) floating-point elements in a and b using the corresponding selector
11626/// and index in idx, and store the results in dst.
11627///
11628/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ph)
11629#[inline]
11630#[target_feature(enable = "avx512fp16")]
11631#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11632pub fn _mm512_permutex2var_ph(a: __m512h, idx: __m512i, b: __m512h) -> __m512h {
11633    _mm512_castsi512_ph(_mm512_permutex2var_epi16(
11634        _mm512_castph_si512(a),
11635        idx,
11636        _mm512_castph_si512(b),
11637    ))
11638}
11639
11640/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11641/// and store the results in dst.
11642///
11643/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutexvar_ph)
11644#[inline]
11645#[target_feature(enable = "avx512fp16,avx512vl")]
11646#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11647pub fn _mm_permutexvar_ph(idx: __m128i, a: __m128h) -> __m128h {
11648    _mm_castsi128_ph(_mm_permutexvar_epi16(idx, _mm_castph_si128(a)))
11649}
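
// Illustrative usage sketch (not part of the library): the single-source form
// reads each destination lane from `a[idx[i] & 7]`, so a descending index
// vector reverses the element order.
//
//     let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
//     let idx = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
//     let r = _mm_permutexvar_ph(idx, a);   // elements of a in reverse order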
11650
11651/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11652/// and store the results in dst.
11653///
11654/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ph)
11655#[inline]
11656#[target_feature(enable = "avx512fp16,avx512vl")]
11657#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11658pub fn _mm256_permutexvar_ph(idx: __m256i, a: __m256h) -> __m256h {
11659    _mm256_castsi256_ph(_mm256_permutexvar_epi16(idx, _mm256_castph_si256(a)))
11660}
11661
11662/// Shuffle half-precision (16-bit) floating-point elements in a using the corresponding index in idx,
11663/// and store the results in dst.
11664///
11665/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ph)
11666#[inline]
11667#[target_feature(enable = "avx512fp16")]
11668#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11669pub fn _mm512_permutexvar_ph(idx: __m512i, a: __m512h) -> __m512h {
11670    _mm512_castsi512_ph(_mm512_permutexvar_epi16(idx, _mm512_castph_si512(a)))
11671}
11672
11673/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11674/// and store the results in dst.
11675///
11676/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi16_ph)
11677#[inline]
11678#[target_feature(enable = "avx512fp16,avx512vl")]
11679#[cfg_attr(test, assert_instr(vcvtw2ph))]
11680#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11681pub fn _mm_cvtepi16_ph(a: __m128i) -> __m128h {
11682    unsafe { vcvtw2ph_128(a.as_i16x8(), _MM_FROUND_CUR_DIRECTION) }
11683}
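
// Illustrative usage sketch (not part of the library): each signed 16-bit
// lane is converted to an f16 lane; small magnitudes convert exactly, while
// magnitudes above 2048 may round (f16 has an 11-bit significand).
//
//     let a = _mm_set1_epi16(-3);
//     let r = _mm_cvtepi16_ph(a);   // every lane is -3.0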
11684
11685/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11686/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11687/// mask bit is not set).
11688///
11689/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_ph)
11690#[inline]
11691#[target_feature(enable = "avx512fp16,avx512vl")]
11692#[cfg_attr(test, assert_instr(vcvtw2ph))]
11693#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11694pub fn _mm_mask_cvtepi16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11695    unsafe { simd_select_bitmask(k, _mm_cvtepi16_ph(a), src) }
11696}
11697
11698/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11699/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11700///
11701/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_ph)
11702#[inline]
11703#[target_feature(enable = "avx512fp16,avx512vl")]
11704#[cfg_attr(test, assert_instr(vcvtw2ph))]
11705#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11706pub fn _mm_maskz_cvtepi16_ph(k: __mmask8, a: __m128i) -> __m128h {
11707    _mm_mask_cvtepi16_ph(_mm_setzero_ph(), k, a)
11708}
11709
11710/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11711/// and store the results in dst.
11712///
11713/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi16_ph)
11714#[inline]
11715#[target_feature(enable = "avx512fp16,avx512vl")]
11716#[cfg_attr(test, assert_instr(vcvtw2ph))]
11717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11718pub fn _mm256_cvtepi16_ph(a: __m256i) -> __m256h {
11719    unsafe { vcvtw2ph_256(a.as_i16x16(), _MM_FROUND_CUR_DIRECTION) }
11720}
11721
11722/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11723/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11724/// mask bit is not set).
11725///
11726/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_ph)
11727#[inline]
11728#[target_feature(enable = "avx512fp16,avx512vl")]
11729#[cfg_attr(test, assert_instr(vcvtw2ph))]
11730#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11731pub fn _mm256_mask_cvtepi16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11732    unsafe { simd_select_bitmask(k, _mm256_cvtepi16_ph(a), src) }
11733}
11734
11735/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11736/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11737///
11738/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_ph)
11739#[inline]
11740#[target_feature(enable = "avx512fp16,avx512vl")]
11741#[cfg_attr(test, assert_instr(vcvtw2ph))]
11742#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11743pub fn _mm256_maskz_cvtepi16_ph(k: __mmask16, a: __m256i) -> __m256h {
11744    _mm256_mask_cvtepi16_ph(_mm256_setzero_ph(), k, a)
11745}
11746
11747/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11748/// and store the results in dst.
11749///
11750/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_ph)
11751#[inline]
11752#[target_feature(enable = "avx512fp16")]
11753#[cfg_attr(test, assert_instr(vcvtw2ph))]
11754#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11755pub fn _mm512_cvtepi16_ph(a: __m512i) -> __m512h {
11756    unsafe { vcvtw2ph_512(a.as_i16x32(), _MM_FROUND_CUR_DIRECTION) }
11757}
11758
11759/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11760/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11761/// mask bit is not set).
11762///
11763/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_ph)
11764#[inline]
11765#[target_feature(enable = "avx512fp16")]
11766#[cfg_attr(test, assert_instr(vcvtw2ph))]
11767#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11768pub fn _mm512_mask_cvtepi16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11769    unsafe { simd_select_bitmask(k, _mm512_cvtepi16_ph(a), src) }
11770}
11771
11772/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11773/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11774///
11775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_ph)
11776#[inline]
11777#[target_feature(enable = "avx512fp16")]
11778#[cfg_attr(test, assert_instr(vcvtw2ph))]
11779#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11780pub fn _mm512_maskz_cvtepi16_ph(k: __mmask32, a: __m512i) -> __m512h {
11781    _mm512_mask_cvtepi16_ph(_mm512_setzero_ph(), k, a)
11782}
11783
11784/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11785/// and store the results in dst.
11786///
11787/// Rounding is done according to the rounding parameter, which can be one of:
11788///
11789/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11790/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11791/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11792/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11793/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11794///
11795/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi16_ph)
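///
/// # Example
///
/// An illustrative sketch (not from the original documentation) of passing the rounding
/// mode as a const generic; assumes the unstable `stdarch_x86_avx512_f16` feature and a
/// CPU with AVX512-FP16:
///
/// ```ignore
/// let a = _mm512_set1_epi16(42);
/// // 42 is exactly representable in f16, so every rounding mode gives the same result;
/// // the mode only matters for magnitudes above 2048, where f16 can no longer represent
/// // every integer.
/// let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
/// // every f16 lane of `r` is 42.0
/// ```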
11796#[inline]
11797#[target_feature(enable = "avx512fp16")]
11798#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11799#[rustc_legacy_const_generics(1)]
11800#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11801pub fn _mm512_cvt_roundepi16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11802    unsafe {
11803        static_assert_rounding!(ROUNDING);
11804        vcvtw2ph_512(a.as_i16x32(), ROUNDING)
11805    }
11806}
11807
11808/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11809/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11810/// mask bit is not set).
11811///
11812/// Rounding is done according to the rounding parameter, which can be one of:
11813///
11814/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11815/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11816/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11817/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11818/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11819///
11820/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi16_ph)
11821#[inline]
11822#[target_feature(enable = "avx512fp16")]
11823#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11824#[rustc_legacy_const_generics(3)]
11825#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11826pub fn _mm512_mask_cvt_roundepi16_ph<const ROUNDING: i32>(
11827    src: __m512h,
11828    k: __mmask32,
11829    a: __m512i,
11830) -> __m512h {
11831    unsafe {
11832        static_assert_rounding!(ROUNDING);
11833        simd_select_bitmask(k, _mm512_cvt_roundepi16_ph::<ROUNDING>(a), src)
11834    }
11835}
11836
11837/// Convert packed signed 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11838/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11839///
11840/// Rounding is done according to the rounding parameter, which can be one of:
11841///
11842/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11843/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11844/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11845/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11846/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11847///
11848/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi16_ph)
11849#[inline]
11850#[target_feature(enable = "avx512fp16")]
11851#[cfg_attr(test, assert_instr(vcvtw2ph, ROUNDING = 8))]
11852#[rustc_legacy_const_generics(2)]
11853#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11854pub fn _mm512_maskz_cvt_roundepi16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
11855    static_assert_rounding!(ROUNDING);
11856    _mm512_mask_cvt_roundepi16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
11857}
11858
11859/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11860/// and store the results in dst.
11861///
11862/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu16_ph)
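///
/// # Example
///
/// An illustrative sketch (not from the original documentation); assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16 and AVX512-VL:
///
/// ```ignore
/// // The bit pattern 0x8000 is read as the unsigned value 32768, not as -32768.
/// let a = _mm_set1_epi16(i16::MIN);
/// let r = _mm_cvtepu16_ph(a); // every f16 lane of `r` is 32768.0
/// ```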
11863#[inline]
11864#[target_feature(enable = "avx512fp16,avx512vl")]
11865#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11866#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11867pub fn _mm_cvtepu16_ph(a: __m128i) -> __m128h {
11868    unsafe { vcvtuw2ph_128(a.as_u16x8(), _MM_FROUND_CUR_DIRECTION) }
11869}
11870
11871/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11872/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11873/// mask bit is not set).
11874///
11875/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_ph)
11876#[inline]
11877#[target_feature(enable = "avx512fp16,avx512vl")]
11878#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11879#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11880pub fn _mm_mask_cvtepu16_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
11881    unsafe { simd_select_bitmask(k, _mm_cvtepu16_ph(a), src) }
11882}
11883
11884/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11885/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11886///
11887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_ph)
11888#[inline]
11889#[target_feature(enable = "avx512fp16,avx512vl")]
11890#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11891#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11892pub fn _mm_maskz_cvtepu16_ph(k: __mmask8, a: __m128i) -> __m128h {
11893    _mm_mask_cvtepu16_ph(_mm_setzero_ph(), k, a)
11894}
11895
11896/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11897/// and store the results in dst.
11898///
11899/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu16_ph)
11900#[inline]
11901#[target_feature(enable = "avx512fp16,avx512vl")]
11902#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11903#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11904pub fn _mm256_cvtepu16_ph(a: __m256i) -> __m256h {
11905    unsafe { vcvtuw2ph_256(a.as_u16x16(), _MM_FROUND_CUR_DIRECTION) }
11906}
11907
11908/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11909/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11910/// mask bit is not set).
11911///
11912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_ph)
11913#[inline]
11914#[target_feature(enable = "avx512fp16,avx512vl")]
11915#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11916#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11917pub fn _mm256_mask_cvtepu16_ph(src: __m256h, k: __mmask16, a: __m256i) -> __m256h {
11918    unsafe { simd_select_bitmask(k, _mm256_cvtepu16_ph(a), src) }
11919}
11920
11921/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11922/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11923///
11924/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_ph)
11925#[inline]
11926#[target_feature(enable = "avx512fp16,avx512vl")]
11927#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11928#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11929pub fn _mm256_maskz_cvtepu16_ph(k: __mmask16, a: __m256i) -> __m256h {
11930    _mm256_mask_cvtepu16_ph(_mm256_setzero_ph(), k, a)
11931}
11932
11933/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11934/// and store the results in dst.
11935///
11936/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_ph)
11937#[inline]
11938#[target_feature(enable = "avx512fp16")]
11939#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11940#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11941pub fn _mm512_cvtepu16_ph(a: __m512i) -> __m512h {
11942    unsafe { vcvtuw2ph_512(a.as_u16x32(), _MM_FROUND_CUR_DIRECTION) }
11943}
11944
11945/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11946/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11947/// mask bit is not set).
11948///
11949/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_ph)
11950#[inline]
11951#[target_feature(enable = "avx512fp16")]
11952#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11953#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11954pub fn _mm512_mask_cvtepu16_ph(src: __m512h, k: __mmask32, a: __m512i) -> __m512h {
11955    unsafe { simd_select_bitmask(k, _mm512_cvtepu16_ph(a), src) }
11956}
11957
11958/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11959/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
11960///
11961/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_ph)
11962#[inline]
11963#[target_feature(enable = "avx512fp16")]
11964#[cfg_attr(test, assert_instr(vcvtuw2ph))]
11965#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11966pub fn _mm512_maskz_cvtepu16_ph(k: __mmask32, a: __m512i) -> __m512h {
11967    _mm512_mask_cvtepu16_ph(_mm512_setzero_ph(), k, a)
11968}
11969
11970/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11971/// and store the results in dst.
11972///
11973/// Rounding is done according to the rounding parameter, which can be one of:
11974///
11975/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
11976/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
11977/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
11978/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
11979/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
11980///
11981/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu16_ph)
11982#[inline]
11983#[target_feature(enable = "avx512fp16")]
11984#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
11985#[rustc_legacy_const_generics(1)]
11986#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
11987pub fn _mm512_cvt_roundepu16_ph<const ROUNDING: i32>(a: __m512i) -> __m512h {
11988    unsafe {
11989        static_assert_rounding!(ROUNDING);
11990        vcvtuw2ph_512(a.as_u16x32(), ROUNDING)
11991    }
11992}
11993
11994/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
11995/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
11996/// mask bit is not set).
11997///
11998/// Rounding is done according to the rounding parameter, which can be one of:
11999///
12000/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12001/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12002/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12003/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12004/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12005///
12006/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu16_ph)
12007#[inline]
12008#[target_feature(enable = "avx512fp16")]
12009#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
12010#[rustc_legacy_const_generics(3)]
12011#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12012pub fn _mm512_mask_cvt_roundepu16_ph<const ROUNDING: i32>(
12013    src: __m512h,
12014    k: __mmask32,
12015    a: __m512i,
12016) -> __m512h {
12017    unsafe {
12018        static_assert_rounding!(ROUNDING);
12019        simd_select_bitmask(k, _mm512_cvt_roundepu16_ph::<ROUNDING>(a), src)
12020    }
12021}
12022
12023/// Convert packed unsigned 16-bit integers in a to packed half-precision (16-bit) floating-point elements,
12024/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12025///
12026/// Rounding is done according to the rounding parameter, which can be one of:
12027///
12028/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12029/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12030/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12031/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12032/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12033///
12034/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu16_ph)
12035#[inline]
12036#[target_feature(enable = "avx512fp16")]
12037#[cfg_attr(test, assert_instr(vcvtuw2ph, ROUNDING = 8))]
12038#[rustc_legacy_const_generics(2)]
12039#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12040pub fn _mm512_maskz_cvt_roundepu16_ph<const ROUNDING: i32>(k: __mmask32, a: __m512i) -> __m512h {
12041    static_assert_rounding!(ROUNDING);
12042    _mm512_mask_cvt_roundepu16_ph::<ROUNDING>(_mm512_setzero_ph(), k, a)
12043}
12044
12045/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12046/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12047///
12048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_ph)
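///
/// # Example
///
/// An illustrative sketch (not from the original documentation); assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16 and AVX512-VL:
///
/// ```ignore
/// let a = _mm_set_epi32(4, 3, 2, 1);
/// // Four 32-bit lanes produce only four f16 results, which land in the low 64 bits
/// // of dst; the remaining four f16 lanes are zeroed.
/// let r = _mm_cvtepi32_ph(a);
/// // r holds the same values as _mm_set_ph(0.0, 0.0, 0.0, 0.0, 4.0, 3.0, 2.0, 1.0)
/// ```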
12049#[inline]
12050#[target_feature(enable = "avx512fp16,avx512vl")]
12051#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12052#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12053pub fn _mm_cvtepi32_ph(a: __m128i) -> __m128h {
12054    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), 0xff, a)
12055}
12056
12057/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12058/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12059/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12060///
12061/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ph)
12062#[inline]
12063#[target_feature(enable = "avx512fp16,avx512vl")]
12064#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12065#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12066pub fn _mm_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12067    unsafe { vcvtdq2ph_128(a.as_i32x4(), src, k) }
12068}
12069
12070/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12071/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12072/// The upper 64 bits of dst are zeroed out.
12073///
12074/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ph)
12075#[inline]
12076#[target_feature(enable = "avx512fp16,avx512vl")]
12077#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12078#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12079pub fn _mm_maskz_cvtepi32_ph(k: __mmask8, a: __m128i) -> __m128h {
12080    _mm_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
12081}
12082
12083/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12084/// and store the results in dst.
12085///
12086/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_ph)
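///
/// # Example
///
/// An illustrative sketch (not from the original documentation); assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16 and AVX512-VL:
///
/// ```ignore
/// let a = _mm256_set1_epi32(-3);
/// // Eight 32-bit lanes produce eight f16 results, exactly filling the 128-bit dst,
/// // so no lanes are zeroed here.
/// let r = _mm256_cvtepi32_ph(a); // every f16 lane of `r` is -3.0
/// ```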
12087#[inline]
12088#[target_feature(enable = "avx512fp16,avx512vl")]
12089#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12091pub fn _mm256_cvtepi32_ph(a: __m256i) -> __m128h {
12092    unsafe { vcvtdq2ph_256(a.as_i32x8(), _MM_FROUND_CUR_DIRECTION) }
12093}
12094
12095/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12096/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12097/// mask bit is not set).
12098///
12099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ph)
12100#[inline]
12101#[target_feature(enable = "avx512fp16,avx512vl")]
12102#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12103#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12104pub fn _mm256_mask_cvtepi32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12105    unsafe { simd_select_bitmask(k, _mm256_cvtepi32_ph(a), src) }
12106}
12107
12108/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12109/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12110///
12111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ph)
12112#[inline]
12113#[target_feature(enable = "avx512fp16,avx512vl")]
12114#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12116pub fn _mm256_maskz_cvtepi32_ph(k: __mmask8, a: __m256i) -> __m128h {
12117    _mm256_mask_cvtepi32_ph(_mm_setzero_ph(), k, a)
12118}
12119
12120/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12121/// and store the results in dst.
12122///
12123/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ph)
12124#[inline]
12125#[target_feature(enable = "avx512fp16")]
12126#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12127#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12128pub fn _mm512_cvtepi32_ph(a: __m512i) -> __m256h {
12129    unsafe { vcvtdq2ph_512(a.as_i32x16(), _MM_FROUND_CUR_DIRECTION) }
12130}
12131
12132/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12133/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12134/// mask bit is not set).
12135///
12136/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ph)
12137#[inline]
12138#[target_feature(enable = "avx512fp16")]
12139#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12141pub fn _mm512_mask_cvtepi32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12142    unsafe { simd_select_bitmask(k, _mm512_cvtepi32_ph(a), src) }
12143}
12144
12145/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12146/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12147///
12148/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ph)
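///
/// # Example
///
/// A sketch of the zeromask behaviour (illustrative only, not from the original
/// documentation); assumes the unstable `stdarch_x86_avx512_f16` feature and a CPU
/// with AVX512-FP16:
///
/// ```ignore
/// let a = _mm512_set1_epi32(9);
/// // Only the even-numbered lanes are selected by the mask; the odd lanes are zeroed.
/// let r = _mm512_maskz_cvtepi32_ph(0x5555, a);
/// // the 16 f16 lanes of `r` alternate 9.0, 0.0, 9.0, 0.0, ...
/// ```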
12149#[inline]
12150#[target_feature(enable = "avx512fp16")]
12151#[cfg_attr(test, assert_instr(vcvtdq2ph))]
12152#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12153pub fn _mm512_maskz_cvtepi32_ph(k: __mmask16, a: __m512i) -> __m256h {
12154    _mm512_mask_cvtepi32_ph(f16x16::ZERO.as_m256h(), k, a)
12155}
12156
12157/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12158/// and store the results in dst.
12159///
12160/// Rounding is done according to the rounding parameter, which can be one of:
12161///
12162/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12163/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12164/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12165/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12166/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12167///
12168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ph)
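///
/// # Example
///
/// An illustrative sketch (not from the original documentation) of a case where the
/// rounding mode is observable: 2049 is not representable in f16, whose spacing between
/// consecutive values is 2 in the range [2048, 4096). Assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16:
///
/// ```ignore
/// let a = _mm512_set1_epi32(2049);
/// let down = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// let up = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC }>(a);
/// // every lane of `down` is 2048.0 and every lane of `up` is 2050.0
/// ```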
12169#[inline]
12170#[target_feature(enable = "avx512fp16")]
12171#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12172#[rustc_legacy_const_generics(1)]
12173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12174pub fn _mm512_cvt_roundepi32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12175    unsafe {
12176        static_assert_rounding!(ROUNDING);
12177        vcvtdq2ph_512(a.as_i32x16(), ROUNDING)
12178    }
12179}
12180
12181/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12182/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12183/// mask bit is not set).
12184///
12185/// Rounding is done according to the rounding parameter, which can be one of:
12186///
12187/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12188/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12189/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12190/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12191/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12192///
12193/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ph)
12194#[inline]
12195#[target_feature(enable = "avx512fp16")]
12196#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12197#[rustc_legacy_const_generics(3)]
12198#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12199pub fn _mm512_mask_cvt_roundepi32_ph<const ROUNDING: i32>(
12200    src: __m256h,
12201    k: __mmask16,
12202    a: __m512i,
12203) -> __m256h {
12204    unsafe {
12205        static_assert_rounding!(ROUNDING);
12206        simd_select_bitmask(k, _mm512_cvt_roundepi32_ph::<ROUNDING>(a), src)
12207    }
12208}
12209
12210/// Convert packed signed 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12211/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12212///
12213/// Rounding is done according to the rounding parameter, which can be one of:
12214///
12215/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12216/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12217/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12218/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12219/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12220///
12221/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ph)
12222#[inline]
12223#[target_feature(enable = "avx512fp16")]
12224#[cfg_attr(test, assert_instr(vcvtdq2ph, ROUNDING = 8))]
12225#[rustc_legacy_const_generics(2)]
12226#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12227pub fn _mm512_maskz_cvt_roundepi32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12228    static_assert_rounding!(ROUNDING);
12229    _mm512_mask_cvt_roundepi32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12230}
12231
12232/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12233/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12234/// of dst.
12235///
12236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvti32_sh)
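///
/// # Example
///
/// An illustrative sketch (not from the original documentation); assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16:
///
/// ```ignore
/// let a = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
/// // The lower lane becomes 100.0; the upper seven lanes are copied from `a`.
/// let r = _mm_cvti32_sh(a, 100);
/// // r holds the same values as _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 100.0)
/// ```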
12237#[inline]
12238#[target_feature(enable = "avx512fp16")]
12239#[cfg_attr(test, assert_instr(vcvtsi2sh))]
12240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12241pub fn _mm_cvti32_sh(a: __m128h, b: i32) -> __m128h {
12242    unsafe { vcvtsi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12243}
12244
12245/// Convert the signed 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12246/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12247/// of dst.
12248///
12249/// Rounding is done according to the rounding parameter, which can be one of:
12250///
12251/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12252/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12253/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12254/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12255/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12256///
12257/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundi32_sh)
12258#[inline]
12259#[target_feature(enable = "avx512fp16")]
12260#[cfg_attr(test, assert_instr(vcvtsi2sh, ROUNDING = 8))]
12261#[rustc_legacy_const_generics(2)]
12262#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12263pub fn _mm_cvt_roundi32_sh<const ROUNDING: i32>(a: __m128h, b: i32) -> __m128h {
12264    unsafe {
12265        static_assert_rounding!(ROUNDING);
12266        vcvtsi2sh(a, b, ROUNDING)
12267    }
12268}
12269
12270/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12271/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12272///
12273/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_ph)
12274#[inline]
12275#[target_feature(enable = "avx512fp16,avx512vl")]
12276#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12277#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12278pub fn _mm_cvtepu32_ph(a: __m128i) -> __m128h {
12279    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), 0xff, a)
12280}
12281
12282/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12283/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12284/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12285///
12286/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_ph)
12287#[inline]
12288#[target_feature(enable = "avx512fp16,avx512vl")]
12289#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12290#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12291pub fn _mm_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12292    unsafe { vcvtudq2ph_128(a.as_u32x4(), src, k) }
12293}
12294
12295/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12296/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12297/// The upper 64 bits of dst are zeroed out.
12298///
12299/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_ph)
12300#[inline]
12301#[target_feature(enable = "avx512fp16,avx512vl")]
12302#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12303#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12304pub fn _mm_maskz_cvtepu32_ph(k: __mmask8, a: __m128i) -> __m128h {
12305    _mm_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12306}
12307
12308/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12309/// and store the results in dst.
12310///
12311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_ph)
12312#[inline]
12313#[target_feature(enable = "avx512fp16,avx512vl")]
12314#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12315#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12316pub fn _mm256_cvtepu32_ph(a: __m256i) -> __m128h {
12317    unsafe { vcvtudq2ph_256(a.as_u32x8(), _MM_FROUND_CUR_DIRECTION) }
12318}
12319
12320/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12321/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12322/// mask bit is not set).
12323///
12324/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_ph)
12325#[inline]
12326#[target_feature(enable = "avx512fp16,avx512vl")]
12327#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12328#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12329pub fn _mm256_mask_cvtepu32_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12330    unsafe { simd_select_bitmask(k, _mm256_cvtepu32_ph(a), src) }
12331}
12332
12333/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12334/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12335///
12336/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_ph)
12337#[inline]
12338#[target_feature(enable = "avx512fp16,avx512vl")]
12339#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12340#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12341pub fn _mm256_maskz_cvtepu32_ph(k: __mmask8, a: __m256i) -> __m128h {
12342    _mm256_mask_cvtepu32_ph(_mm_setzero_ph(), k, a)
12343}
12344
12345/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12346/// and store the results in dst.
12347///
12348/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ph)
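///
/// # Example
///
/// An illustrative sketch (not from the original documentation); assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16:
///
/// ```ignore
/// // 40000 is exactly representable in f16: it is a multiple of 32, the spacing of
/// // f16 values in the range [32768, 65536).
/// let a = _mm512_set1_epi32(40000);
/// let r = _mm512_cvtepu32_ph(a); // every f16 lane of `r` is 40000.0
/// ```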
12349#[inline]
12350#[target_feature(enable = "avx512fp16")]
12351#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12352#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12353pub fn _mm512_cvtepu32_ph(a: __m512i) -> __m256h {
12354    unsafe { vcvtudq2ph_512(a.as_u32x16(), _MM_FROUND_CUR_DIRECTION) }
12355}
12356
12357/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12358/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12359/// mask bit is not set).
12360///
12361/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ph)
12362#[inline]
12363#[target_feature(enable = "avx512fp16")]
12364#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12365#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12366pub fn _mm512_mask_cvtepu32_ph(src: __m256h, k: __mmask16, a: __m512i) -> __m256h {
12367    unsafe { simd_select_bitmask(k, _mm512_cvtepu32_ph(a), src) }
12368}
12369
12370/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12371/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12372///
12373/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ph)
12374#[inline]
12375#[target_feature(enable = "avx512fp16")]
12376#[cfg_attr(test, assert_instr(vcvtudq2ph))]
12377#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12378pub fn _mm512_maskz_cvtepu32_ph(k: __mmask16, a: __m512i) -> __m256h {
12379    _mm512_mask_cvtepu32_ph(f16x16::ZERO.as_m256h(), k, a)
12380}
12381
12382/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12383/// and store the results in dst.
12384///
12385/// Rounding is done according to the rounding parameter, which can be one of:
12386///
12387/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12388/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12389/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12390/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12391/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12392///
12393/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ph)
12394#[inline]
12395#[target_feature(enable = "avx512fp16")]
12396#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12397#[rustc_legacy_const_generics(1)]
12398#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12399pub fn _mm512_cvt_roundepu32_ph<const ROUNDING: i32>(a: __m512i) -> __m256h {
12400    unsafe {
12401        static_assert_rounding!(ROUNDING);
12402        vcvtudq2ph_512(a.as_u32x16(), ROUNDING)
12403    }
12404}
12405
12406/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12407/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12408/// mask bit is not set).
12409///
12410/// Rounding is done according to the rounding parameter, which can be one of:
12411///
12412/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12413/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12414/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12415/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12416/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12417///
12418/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ph)
12419#[inline]
12420#[target_feature(enable = "avx512fp16")]
12421#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12422#[rustc_legacy_const_generics(3)]
12423#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12424pub fn _mm512_mask_cvt_roundepu32_ph<const ROUNDING: i32>(
12425    src: __m256h,
12426    k: __mmask16,
12427    a: __m512i,
12428) -> __m256h {
12429    unsafe {
12430        static_assert_rounding!(ROUNDING);
12431        simd_select_bitmask(k, _mm512_cvt_roundepu32_ph::<ROUNDING>(a), src)
12432    }
12433}
12434
12435/// Convert packed unsigned 32-bit integers in a to packed half-precision (16-bit) floating-point elements,
12436/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12437///
12438/// Rounding is done according to the rounding parameter, which can be one of:
12439///
12440/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12441/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12442/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12443/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12444/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12445///
12446/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ph)
12447#[inline]
12448#[target_feature(enable = "avx512fp16")]
12449#[cfg_attr(test, assert_instr(vcvtudq2ph, ROUNDING = 8))]
12450#[rustc_legacy_const_generics(2)]
12451#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12452pub fn _mm512_maskz_cvt_roundepu32_ph<const ROUNDING: i32>(k: __mmask16, a: __m512i) -> __m256h {
12453    static_assert_rounding!(ROUNDING);
12454    _mm512_mask_cvt_roundepu32_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
12455}
12456
12457/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12458/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12459/// of dst.
12460///
12461/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sh)
12462#[inline]
12463#[target_feature(enable = "avx512fp16")]
12464#[cfg_attr(test, assert_instr(vcvtusi2sh))]
12465#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12466pub fn _mm_cvtu32_sh(a: __m128h, b: u32) -> __m128h {
12467    unsafe { vcvtusi2sh(a, b, _MM_FROUND_CUR_DIRECTION) }
12468}
12469
12470/// Convert the unsigned 32-bit integer b to a half-precision (16-bit) floating-point element, store the
12471/// result in the lower element of dst, and copy the upper 7 packed elements from a to the upper elements
12472/// of dst.
12473///
12474/// Rounding is done according to the rounding parameter, which can be one of:
12475///
12476/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12477/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12478/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12479/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12480/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12481///
12482/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundu32_sh)
12483#[inline]
12484#[target_feature(enable = "avx512fp16")]
12485#[cfg_attr(test, assert_instr(vcvtusi2sh, ROUNDING = 8))]
12486#[rustc_legacy_const_generics(2)]
12487#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12488pub fn _mm_cvt_roundu32_sh<const ROUNDING: i32>(a: __m128h, b: u32) -> __m128h {
12489    unsafe {
12490        static_assert_rounding!(ROUNDING);
12491        vcvtusi2sh(a, b, ROUNDING)
12492    }
12493}
12494
12495/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12496/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12497///
12498/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_ph)
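///
/// # Example
///
/// An illustrative sketch (not from the original documentation); assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16 and AVX512-VL:
///
/// ```ignore
/// let a = _mm_set_epi64x(-2, 1);
/// // Two 64-bit lanes produce only two f16 results, which land in the low 32 bits
/// // of dst; the remaining six f16 lanes are zeroed.
/// let r = _mm_cvtepi64_ph(a);
/// // r holds the same values as _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -2.0, 1.0)
/// ```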
12499#[inline]
12500#[target_feature(enable = "avx512fp16,avx512vl")]
12501#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12502#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12503pub fn _mm_cvtepi64_ph(a: __m128i) -> __m128h {
12504    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12505}
12506
12507/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12508/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12509/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12510///
12511/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_ph)
12512#[inline]
12513#[target_feature(enable = "avx512fp16,avx512vl")]
12514#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12515#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12516pub fn _mm_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12517    unsafe { vcvtqq2ph_128(a.as_i64x2(), src, k) }
12518}
12519
12520/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12521/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12522/// The upper 96 bits of dst are zeroed out.
12523///
12524/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_ph)
12525#[inline]
12526#[target_feature(enable = "avx512fp16,avx512vl")]
12527#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12528#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12529pub fn _mm_maskz_cvtepi64_ph(k: __mmask8, a: __m128i) -> __m128h {
12530    _mm_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12531}
12532
12533/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12534/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12535///
12536/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_ph)
12537#[inline]
12538#[target_feature(enable = "avx512fp16,avx512vl")]
12539#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12540#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12541pub fn _mm256_cvtepi64_ph(a: __m256i) -> __m128h {
12542    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), 0xff, a)
12543}
12544
12545/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12546/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12547/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12548///
12549/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_ph)
12550#[inline]
12551#[target_feature(enable = "avx512fp16,avx512vl")]
12552#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12553#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12554pub fn _mm256_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12555    unsafe { vcvtqq2ph_256(a.as_i64x4(), src, k) }
12556}
12557
12558/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12559/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12560/// The upper 64 bits of dst are zeroed out.
12561///
12562/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_ph)
12563#[inline]
12564#[target_feature(enable = "avx512fp16,avx512vl")]
12565#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12566#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12567pub fn _mm256_maskz_cvtepi64_ph(k: __mmask8, a: __m256i) -> __m128h {
12568    _mm256_mask_cvtepi64_ph(_mm_setzero_ph(), k, a)
12569}
12570
12571/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12572/// and store the results in dst.
12573///
12574/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_ph)
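///
/// # Example
///
/// An illustrative sketch (not from the original documentation); assumes the unstable
/// `stdarch_x86_avx512_f16` feature and a CPU with AVX512-FP16:
///
/// ```ignore
/// let a = _mm512_set1_epi64(-5);
/// // Eight 64-bit lanes produce eight f16 results, exactly filling the 128-bit dst.
/// let r = _mm512_cvtepi64_ph(a); // every f16 lane of `r` is -5.0
/// ```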
12575#[inline]
12576#[target_feature(enable = "avx512fp16")]
12577#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12578#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12579pub fn _mm512_cvtepi64_ph(a: __m512i) -> __m128h {
12580    unsafe { vcvtqq2ph_512(a.as_i64x8(), _MM_FROUND_CUR_DIRECTION) }
12581}
12582
12583/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12584/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12585/// mask bit is not set).
12586///
12587/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_ph)
12588#[inline]
12589#[target_feature(enable = "avx512fp16")]
12590#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12591#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12592pub fn _mm512_mask_cvtepi64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12593    unsafe { simd_select_bitmask(k, _mm512_cvtepi64_ph(a), src) }
12594}
12595
12596/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12597/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12598///
12599/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_ph)
12600#[inline]
12601#[target_feature(enable = "avx512fp16")]
12602#[cfg_attr(test, assert_instr(vcvtqq2ph))]
12603#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12604pub fn _mm512_maskz_cvtepi64_ph(k: __mmask8, a: __m512i) -> __m128h {
12605    _mm512_mask_cvtepi64_ph(f16x8::ZERO.as_m128h(), k, a)
12606}
12607
12608/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12609/// and store the results in dst.
12610///
12611/// Rounding is done according to the rounding parameter, which can be one of:
12612///
12613/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12614/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12615/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12616/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12617/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12618///
12619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi64_ph)
12620#[inline]
12621#[target_feature(enable = "avx512fp16")]
12622#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12623#[rustc_legacy_const_generics(1)]
12624#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12625pub fn _mm512_cvt_roundepi64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12626    unsafe {
12627        static_assert_rounding!(ROUNDING);
12628        vcvtqq2ph_512(a.as_i64x8(), ROUNDING)
12629    }
12630}
12631
12632/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12633/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12634/// mask bit is not set).
12635///
12636/// Rounding is done according to the rounding parameter, which can be one of:
12637///
12638/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12639/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12640/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12641/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12642/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12643///
12644/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi64_ph)
12645#[inline]
12646#[target_feature(enable = "avx512fp16")]
12647#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12648#[rustc_legacy_const_generics(3)]
12649#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12650pub fn _mm512_mask_cvt_roundepi64_ph<const ROUNDING: i32>(
12651    src: __m128h,
12652    k: __mmask8,
12653    a: __m512i,
12654) -> __m128h {
12655    unsafe {
12656        static_assert_rounding!(ROUNDING);
12657        simd_select_bitmask(k, _mm512_cvt_roundepi64_ph::<ROUNDING>(a), src)
12658    }
12659}
12660
12661/// Convert packed signed 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12662/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12663///
12664/// Rounding is done according to the rounding parameter, which can be one of:
12665///
12666/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12667/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12668/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12669/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12670/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12671///
12672/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi64_ph)
12673#[inline]
12674#[target_feature(enable = "avx512fp16")]
12675#[cfg_attr(test, assert_instr(vcvtqq2ph, ROUNDING = 8))]
12676#[rustc_legacy_const_generics(2)]
12677#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12678pub fn _mm512_maskz_cvt_roundepi64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12679    static_assert_rounding!(ROUNDING);
12680    _mm512_mask_cvt_roundepi64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12681}
12682
12683/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12684/// and store the results in dst. The upper 96 bits of dst are zeroed out.
12685///
12686/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu64_ph)
12687#[inline]
12688#[target_feature(enable = "avx512fp16,avx512vl")]
12689#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12690#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12691pub fn _mm_cvtepu64_ph(a: __m128i) -> __m128h {
12692    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12693}
12694
12695/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12696/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12697/// mask bit is not set). The upper 96 bits of dst are zeroed out.
12698///
12699/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu64_ph)
12700#[inline]
12701#[target_feature(enable = "avx512fp16,avx512vl")]
12702#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12703#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12704pub fn _mm_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m128i) -> __m128h {
12705    unsafe { vcvtuqq2ph_128(a.as_u64x2(), src, k) }
12706}
12707
12708/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12709/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12710/// The upper 96 bits of dst are zeroed out.
12711///
12712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu64_ph)
12713#[inline]
12714#[target_feature(enable = "avx512fp16,avx512vl")]
12715#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12716#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12717pub fn _mm_maskz_cvtepu64_ph(k: __mmask8, a: __m128i) -> __m128h {
12718    _mm_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12719}
12720
12721/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12722/// and store the results in dst. The upper 64 bits of dst are zeroed out.
12723///
12724/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu64_ph)
12725#[inline]
12726#[target_feature(enable = "avx512fp16,avx512vl")]
12727#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12728#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12729pub fn _mm256_cvtepu64_ph(a: __m256i) -> __m128h {
12730    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), 0xff, a)
12731}
12732
12733/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12734/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12735/// mask bit is not set). The upper 64 bits of dst are zeroed out.
12736///
12737/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu64_ph)
12738#[inline]
12739#[target_feature(enable = "avx512fp16,avx512vl")]
12740#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12741#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12742pub fn _mm256_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m256i) -> __m128h {
12743    unsafe { vcvtuqq2ph_256(a.as_u64x4(), src, k) }
12744}
12745
12746/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12747/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12748/// The upper 64 bits of dst are zeroed out.
12749///
12750/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu64_ph)
12751#[inline]
12752#[target_feature(enable = "avx512fp16,avx512vl")]
12753#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12754#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12755pub fn _mm256_maskz_cvtepu64_ph(k: __mmask8, a: __m256i) -> __m128h {
12756    _mm256_mask_cvtepu64_ph(_mm_setzero_ph(), k, a)
12757}
12758
12759/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12760/// and store the results in dst.
12761///
12762/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu64_ph)
12763#[inline]
12764#[target_feature(enable = "avx512fp16")]
12765#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12766#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12767pub fn _mm512_cvtepu64_ph(a: __m512i) -> __m128h {
12768    unsafe { vcvtuqq2ph_512(a.as_u64x8(), _MM_FROUND_CUR_DIRECTION) }
12769}
12770
12771/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12772/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12773/// mask bit is not set).
12774///
12775/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu64_ph)
12776#[inline]
12777#[target_feature(enable = "avx512fp16")]
12778#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12779#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12780pub fn _mm512_mask_cvtepu64_ph(src: __m128h, k: __mmask8, a: __m512i) -> __m128h {
12781    unsafe { simd_select_bitmask(k, _mm512_cvtepu64_ph(a), src) }
12782}
12783
12784/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12785/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12786///
12787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu64_ph)
12788#[inline]
12789#[target_feature(enable = "avx512fp16")]
12790#[cfg_attr(test, assert_instr(vcvtuqq2ph))]
12791#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12792pub fn _mm512_maskz_cvtepu64_ph(k: __mmask8, a: __m512i) -> __m128h {
12793    _mm512_mask_cvtepu64_ph(f16x8::ZERO.as_m128h(), k, a)
12794}
12795
12796/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12797/// and store the results in dst.
12798///
12799/// Rounding is done according to the rounding parameter, which can be one of:
12800///
12801/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12802/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12803/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12804/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12805/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12806///
12807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu64_ph)
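///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` is
/// enabled for the caller and `_mm512_set1_epi64` is in scope):
///
/// ```ignore
/// // Illustrative only: convert while rounding toward negative infinity, suppressing exceptions.
/// let a = _mm512_set1_epi64(7);
/// let h = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// ```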
12808#[inline]
12809#[target_feature(enable = "avx512fp16")]
12810#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12811#[rustc_legacy_const_generics(1)]
12812#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12813pub fn _mm512_cvt_roundepu64_ph<const ROUNDING: i32>(a: __m512i) -> __m128h {
12814    unsafe {
12815        static_assert_rounding!(ROUNDING);
12816        vcvtuqq2ph_512(a.as_u64x8(), ROUNDING)
12817    }
12818}
12819
12820/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12821/// and store the results in dst using writemask k (elements are copied from src to dst when the corresponding
12822/// mask bit is not set).
12823///
12824/// Rounding is done according to the rounding parameter, which can be one of:
12825///
12826/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12827/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12828/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12829/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12830/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12831///
12832/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu64_ph)
12833#[inline]
12834#[target_feature(enable = "avx512fp16")]
12835#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12836#[rustc_legacy_const_generics(3)]
12837#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12838pub fn _mm512_mask_cvt_roundepu64_ph<const ROUNDING: i32>(
12839    src: __m128h,
12840    k: __mmask8,
12841    a: __m512i,
12842) -> __m128h {
12843    unsafe {
12844        static_assert_rounding!(ROUNDING);
12845        simd_select_bitmask(k, _mm512_cvt_roundepu64_ph::<ROUNDING>(a), src)
12846    }
12847}
12848
12849/// Convert packed unsigned 64-bit integers in a to packed half-precision (16-bit) floating-point elements,
12850/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
12851///
12852/// Rounding is done according to the rounding parameter, which can be one of:
12853///
12854/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12855/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12856/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12857/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12858/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12859///
12860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu64_ph)
12861#[inline]
12862#[target_feature(enable = "avx512fp16")]
12863#[cfg_attr(test, assert_instr(vcvtuqq2ph, ROUNDING = 8))]
12864#[rustc_legacy_const_generics(2)]
12865#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12866pub fn _mm512_maskz_cvt_roundepu64_ph<const ROUNDING: i32>(k: __mmask8, a: __m512i) -> __m128h {
12867    static_assert_rounding!(ROUNDING);
12868    _mm512_mask_cvt_roundepu64_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
12869}
12870
12871/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12872/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
12873///
12874/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxps_ph)
12875#[inline]
12876#[target_feature(enable = "avx512fp16,avx512vl")]
12877#[cfg_attr(test, assert_instr(vcvtps2phx))]
12878#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12879pub fn _mm_cvtxps_ph(a: __m128) -> __m128h {
12880    _mm_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12881}
12882
12883/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12884/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12885/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12886///
12887/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxps_ph)
12888#[inline]
12889#[target_feature(enable = "avx512fp16,avx512vl")]
12890#[cfg_attr(test, assert_instr(vcvtps2phx))]
12891#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12892pub fn _mm_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m128) -> __m128h {
12893    unsafe { vcvtps2phx_128(a, src, k) }
12894}
12895
12896/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12897/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12898/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
12899///
12900/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxps_ph)
12901#[inline]
12902#[target_feature(enable = "avx512fp16,avx512vl")]
12903#[cfg_attr(test, assert_instr(vcvtps2phx))]
12904#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12905pub fn _mm_maskz_cvtxps_ph(k: __mmask8, a: __m128) -> __m128h {
12906    _mm_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12907}
12908
12909/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12910/// floating-point elements, and store the results in dst.
12911///
12912/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxps_ph)
12913#[inline]
12914#[target_feature(enable = "avx512fp16,avx512vl")]
12915#[cfg_attr(test, assert_instr(vcvtps2phx))]
12916#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12917pub fn _mm256_cvtxps_ph(a: __m256) -> __m128h {
12918    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), 0xff, a)
12919}
12920
12921/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12922/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12923/// when the corresponding mask bit is not set).
12924///
12925/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxps_ph)
12926#[inline]
12927#[target_feature(enable = "avx512fp16,avx512vl")]
12928#[cfg_attr(test, assert_instr(vcvtps2phx))]
12929#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12930pub fn _mm256_mask_cvtxps_ph(src: __m128h, k: __mmask8, a: __m256) -> __m128h {
12931    unsafe { vcvtps2phx_256(a, src, k) }
12932}
12933
12934/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12935/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12936/// corresponding mask bit is not set).
12937///
12938/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxps_ph)
12939#[inline]
12940#[target_feature(enable = "avx512fp16,avx512vl")]
12941#[cfg_attr(test, assert_instr(vcvtps2phx))]
12942#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12943pub fn _mm256_maskz_cvtxps_ph(k: __mmask8, a: __m256) -> __m128h {
12944    _mm256_mask_cvtxps_ph(_mm_setzero_ph(), k, a)
12945}
12946
12947/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12948/// floating-point elements, and store the results in dst.
12949///
12950/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxps_ph)
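///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` is
/// enabled and `_mm512_set1_ps` is in scope):
///
/// ```ignore
/// // Illustrative only; narrows sixteen f32 lanes to sixteen f16 lanes.
/// let a = _mm512_set1_ps(1.5);
/// let h: __m256h = _mm512_cvtxps_ph(a);
/// ```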
12951#[inline]
12952#[target_feature(enable = "avx512fp16")]
12953#[cfg_attr(test, assert_instr(vcvtps2phx))]
12954#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12955pub fn _mm512_cvtxps_ph(a: __m512) -> __m256h {
12956    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), 0xffff, a)
12957}
12958
12959/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12960/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
12961/// when the corresponding mask bit is not set).
12962///
12963/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxps_ph)
12964#[inline]
12965#[target_feature(enable = "avx512fp16")]
12966#[cfg_attr(test, assert_instr(vcvtps2phx))]
12967#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12968pub fn _mm512_mask_cvtxps_ph(src: __m256h, k: __mmask16, a: __m512) -> __m256h {
12969    unsafe { vcvtps2phx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
12970}
12971
12972/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12973/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
12974/// corresponding mask bit is not set).
12975///
12976/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxps_ph)
12977#[inline]
12978#[target_feature(enable = "avx512fp16")]
12979#[cfg_attr(test, assert_instr(vcvtps2phx))]
12980#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
12981pub fn _mm512_maskz_cvtxps_ph(k: __mmask16, a: __m512) -> __m256h {
12982    _mm512_mask_cvtxps_ph(f16x16::ZERO.as_m256h(), k, a)
12983}
12984
12985/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
12986/// floating-point elements, and store the results in dst.
12987///
12988/// Rounding is done according to the rounding parameter, which can be one of:
12989///
12990/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
12991/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
12992/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
12993/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
12994/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
12995///
12996/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundps_ph)
12997#[inline]
12998#[target_feature(enable = "avx512fp16")]
12999#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
13000#[rustc_legacy_const_generics(1)]
13001#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13002pub fn _mm512_cvtx_roundps_ph<const ROUNDING: i32>(a: __m512) -> __m256h {
13003    static_assert_rounding!(ROUNDING);
13004    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), 0xffff, a)
13005}
13006
13007/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
13008/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13009/// when the corresponding mask bit is not set).
13010///
13011/// Rounding is done according to the rounding parameter, which can be one of:
13012///
13013/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13014/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13015/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13016/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13017/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13018///
13019/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundps_ph)
13020#[inline]
13021#[target_feature(enable = "avx512fp16")]
13022#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
13023#[rustc_legacy_const_generics(3)]
13024#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13025pub fn _mm512_mask_cvtx_roundps_ph<const ROUNDING: i32>(
13026    src: __m256h,
13027    k: __mmask16,
13028    a: __m512,
13029) -> __m256h {
13030    unsafe {
13031        static_assert_rounding!(ROUNDING);
13032        vcvtps2phx_512(a, src, k, ROUNDING)
13033    }
13034}
13035
13036/// Convert packed single-precision (32-bit) floating-point elements in a to packed half-precision (16-bit)
13037/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13038/// corresponding mask bit is not set).
13039///
13040/// Rounding is done according to the rounding parameter, which can be one of:
13041///
13042/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13043/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13044/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13045/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13046/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13047///
13048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundps_ph)
13049#[inline]
13050#[target_feature(enable = "avx512fp16")]
13051#[cfg_attr(test, assert_instr(vcvtps2phx, ROUNDING = 8))]
13052#[rustc_legacy_const_generics(2)]
13053#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13054pub fn _mm512_maskz_cvtx_roundps_ph<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m256h {
13055    static_assert_rounding!(ROUNDING);
13056    _mm512_mask_cvtx_roundps_ph::<ROUNDING>(f16x16::ZERO.as_m256h(), k, a)
13057}
13058
13059/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13060/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13061/// elements from a to the upper elements of dst.
13062///
13063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sh)
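///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` is
/// enabled and that `_mm_set1_ph` and the SSE `_mm_set_ss` are in scope):
///
/// ```ignore
/// // Illustrative only.
/// let a = _mm_set1_ph(2.0);
/// let b = _mm_set_ss(0.5);
/// // Lane 0 of `r` is 0.5 converted to f16; lanes 1..=7 are copied from `a`.
/// let r = _mm_cvtss_sh(a, b);
/// ```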
13064#[inline]
13065#[target_feature(enable = "avx512fp16")]
13066#[cfg_attr(test, assert_instr(vcvtss2sh))]
13067#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13068pub fn _mm_cvtss_sh(a: __m128h, b: __m128) -> __m128h {
13069    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
13070}
13071
13072/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13073/// floating-point element, store the result in the lower element of dst using writemask k (the element
13074/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13075/// upper elements of dst.
13076///
13077/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtss_sh)
13078#[inline]
13079#[target_feature(enable = "avx512fp16")]
13080#[cfg_attr(test, assert_instr(vcvtss2sh))]
13081#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13082pub fn _mm_mask_cvtss_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128) -> __m128h {
13083    unsafe { vcvtss2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
13084}
13085
13086/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13087/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13088/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13089/// elements of dst.
13090///
13091/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtss_sh)
13092#[inline]
13093#[target_feature(enable = "avx512fp16")]
13094#[cfg_attr(test, assert_instr(vcvtss2sh))]
13095#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13096pub fn _mm_maskz_cvtss_sh(k: __mmask8, a: __m128h, b: __m128) -> __m128h {
13097    _mm_mask_cvtss_sh(f16x8::ZERO.as_m128h(), k, a, b)
13098}
13099
13100/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13101/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13102/// elements from a to the upper elements of dst.
13103///
13104/// Rounding is done according to the rounding parameter, which can be one of:
13105///
13106/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13107/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13108/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13109/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13110/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13111///
13112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundss_sh)
13113#[inline]
13114#[target_feature(enable = "avx512fp16")]
13115#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13116#[rustc_legacy_const_generics(2)]
13117#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13118pub fn _mm_cvt_roundss_sh<const ROUNDING: i32>(a: __m128h, b: __m128) -> __m128h {
13119    static_assert_rounding!(ROUNDING);
13120    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13121}
13122
13123/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13124/// floating-point element, store the result in the lower element of dst using writemask k (the element
13125/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13126/// upper elements of dst.
13127///
13128/// Rounding is done according to the rounding parameter, which can be one of:
13129///
13130/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13131/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13132/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13133/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13134/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13135///
13136/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundss_sh)
13137#[inline]
13138#[target_feature(enable = "avx512fp16")]
13139#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13140#[rustc_legacy_const_generics(4)]
13141#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13142pub fn _mm_mask_cvt_roundss_sh<const ROUNDING: i32>(
13143    src: __m128h,
13144    k: __mmask8,
13145    a: __m128h,
13146    b: __m128,
13147) -> __m128h {
13148    unsafe {
13149        static_assert_rounding!(ROUNDING);
13150        vcvtss2sh(a, b, src, k, ROUNDING)
13151    }
13152}
13153
13154/// Convert the lower single-precision (32-bit) floating-point element in b to a half-precision (16-bit)
13155/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13156/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13157/// elements of dst.
13158///
13159/// Rounding is done according to the rounding parameter, which can be one of:
13160///
13161/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13162/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13163/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13164/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13165/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13166///
13167/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundss_sh)
13168#[inline]
13169#[target_feature(enable = "avx512fp16")]
13170#[cfg_attr(test, assert_instr(vcvtss2sh, ROUNDING = 8))]
13171#[rustc_legacy_const_generics(3)]
13172#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13173pub fn _mm_maskz_cvt_roundss_sh<const ROUNDING: i32>(
13174    k: __mmask8,
13175    a: __m128h,
13176    b: __m128,
13177) -> __m128h {
13178    static_assert_rounding!(ROUNDING);
13179    _mm_mask_cvt_roundss_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13180}
13181
13182/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13183/// floating-point elements, and store the results in dst. The upper 96 bits of dst are zeroed out.
13184///
13185/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ph)
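///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` and
/// `avx512vl` are enabled and the SSE2 `_mm_set_pd` is in scope):
///
/// ```ignore
/// // Illustrative only.
/// let a = _mm_set_pd(2.0, 1.0);
/// // Lanes 0 and 1 of `h` hold 1.0 and 2.0 as f16; the remaining six lanes are zero.
/// let h = _mm_cvtpd_ph(a);
/// ```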
13186#[inline]
13187#[target_feature(enable = "avx512fp16,avx512vl")]
13188#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13189#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13190pub fn _mm_cvtpd_ph(a: __m128d) -> __m128h {
13191    _mm_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13192}
13193
13194/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13195/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13196/// when the corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13197///
13198/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ph)
13199#[inline]
13200#[target_feature(enable = "avx512fp16,avx512vl")]
13201#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13202#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13203pub fn _mm_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m128d) -> __m128h {
13204    unsafe { vcvtpd2ph_128(a, src, k) }
13205}
13206
13207/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13208/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13209/// corresponding mask bit is not set). The upper 96 bits of dst are zeroed out.
13210///
13211/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ph)
13212#[inline]
13213#[target_feature(enable = "avx512fp16,avx512vl")]
13214#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13215#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13216pub fn _mm_maskz_cvtpd_ph(k: __mmask8, a: __m128d) -> __m128h {
13217    _mm_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13218}
13219
13220/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13221/// floating-point elements, and store the results in dst. The upper 64 bits of dst are zeroed out.
13222///
13223/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_ph)
13224#[inline]
13225#[target_feature(enable = "avx512fp16,avx512vl")]
13226#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13227#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13228pub fn _mm256_cvtpd_ph(a: __m256d) -> __m128h {
13229    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), 0xff, a)
13230}
13231
13232/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13233/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13234/// when the corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13235///
13236/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ph)
13237#[inline]
13238#[target_feature(enable = "avx512fp16,avx512vl")]
13239#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13240#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13241pub fn _mm256_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m256d) -> __m128h {
13242    unsafe { vcvtpd2ph_256(a, src, k) }
13243}
13244
13245/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13246/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13247/// corresponding mask bit is not set). The upper 64 bits of dst are zeroed out.
13248///
13249/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ph)
13250#[inline]
13251#[target_feature(enable = "avx512fp16,avx512vl")]
13252#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13254pub fn _mm256_maskz_cvtpd_ph(k: __mmask8, a: __m256d) -> __m128h {
13255    _mm256_mask_cvtpd_ph(_mm_setzero_ph(), k, a)
13256}
13257
13258/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13259/// floating-point elements, and store the results in dst.
13260///
13261/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ph)
13262#[inline]
13263#[target_feature(enable = "avx512fp16")]
13264#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13265#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13266pub fn _mm512_cvtpd_ph(a: __m512d) -> __m128h {
13267    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), 0xff, a)
13268}
13269
13270/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13271/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13272/// when the corresponding mask bit is not set).
13273///
13274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ph)
13275#[inline]
13276#[target_feature(enable = "avx512fp16")]
13277#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13279pub fn _mm512_mask_cvtpd_ph(src: __m128h, k: __mmask8, a: __m512d) -> __m128h {
13280    unsafe { vcvtpd2ph_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
13281}
13282
13283/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13284/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13285/// corresponding mask bit is not set).
13286///
13287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ph)
13288#[inline]
13289#[target_feature(enable = "avx512fp16")]
13290#[cfg_attr(test, assert_instr(vcvtpd2ph))]
13291#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13292pub fn _mm512_maskz_cvtpd_ph(k: __mmask8, a: __m512d) -> __m128h {
13293    _mm512_mask_cvtpd_ph(f16x8::ZERO.as_m128h(), k, a)
13294}
13295
13296/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13297/// floating-point elements, and store the results in dst.
13298///
13299/// Rounding is done according to the rounding parameter, which can be one of:
13300///
13301/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13302/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13303/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13304/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13305/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13306///
13307/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ph)
13308#[inline]
13309#[target_feature(enable = "avx512fp16")]
13310#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13311#[rustc_legacy_const_generics(1)]
13312#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13313pub fn _mm512_cvt_roundpd_ph<const ROUNDING: i32>(a: __m512d) -> __m128h {
13314    static_assert_rounding!(ROUNDING);
13315    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a)
13316}
13317
13318/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13319/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to dst
13320/// when the corresponding mask bit is not set).
13321///
13322/// Rounding is done according to the rounding parameter, which can be one of:
13323///
13324/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13325/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13326/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13327/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13328/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13329///
13330/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ph)
13331#[inline]
13332#[target_feature(enable = "avx512fp16")]
13333#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13334#[rustc_legacy_const_generics(3)]
13335#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13336pub fn _mm512_mask_cvt_roundpd_ph<const ROUNDING: i32>(
13337    src: __m128h,
13338    k: __mmask8,
13339    a: __m512d,
13340) -> __m128h {
13341    unsafe {
13342        static_assert_rounding!(ROUNDING);
13343        vcvtpd2ph_512(a, src, k, ROUNDING)
13344    }
13345}
13346
13347/// Convert packed double-precision (64-bit) floating-point elements in a to packed half-precision (16-bit)
13348/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
13349/// corresponding mask bit is not set).
13350///
13351/// Rounding is done according to the rounding parameter, which can be one of:
13352///
13353/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13354/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13355/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13356/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13357/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13358///
13359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ph)
13360#[inline]
13361#[target_feature(enable = "avx512fp16")]
13362#[cfg_attr(test, assert_instr(vcvtpd2ph, ROUNDING = 8))]
13363#[rustc_legacy_const_generics(2)]
13364#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13365pub fn _mm512_maskz_cvt_roundpd_ph<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m128h {
13366    static_assert_rounding!(ROUNDING);
13367    _mm512_mask_cvt_roundpd_ph::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a)
13368}
13369
13370/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13371/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13372/// elements from a to the upper elements of dst.
13373///
13374/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_sh)
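///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` is
/// enabled and that `_mm_set1_ph` and the SSE2 `_mm_set_sd` are in scope):
///
/// ```ignore
/// // Illustrative only.
/// let a = _mm_set1_ph(3.0);
/// let b = _mm_set_sd(0.25);
/// // Lane 0 of `r` is 0.25 converted to f16; lanes 1..=7 are copied from `a`.
/// let r = _mm_cvtsd_sh(a, b);
/// ```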
13375#[inline]
13376#[target_feature(enable = "avx512fp16")]
13377#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13378#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13379pub fn _mm_cvtsd_sh(a: __m128h, b: __m128d) -> __m128h {
13380    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), 0xff, a, b)
13381}
13382
13383/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13384/// floating-point element, store the result in the lower element of dst using writemask k (the element
13385/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13386/// upper elements of dst.
13387///
13388/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsd_sh)
13389#[inline]
13390#[target_feature(enable = "avx512fp16")]
13391#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13393pub fn _mm_mask_cvtsd_sh(src: __m128h, k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13394    unsafe { vcvtsd2sh(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
13395}
13396
13397/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13398/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13399/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13400/// elements of dst.
13401///
13402/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsd_sh)
13403#[inline]
13404#[target_feature(enable = "avx512fp16")]
13405#[cfg_attr(test, assert_instr(vcvtsd2sh))]
13406#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13407pub fn _mm_maskz_cvtsd_sh(k: __mmask8, a: __m128h, b: __m128d) -> __m128h {
13408    _mm_mask_cvtsd_sh(f16x8::ZERO.as_m128h(), k, a, b)
13409}
13410
13411/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13412/// floating-point element, store the result in the lower element of dst, and copy the upper 7 packed
13413/// elements from a to the upper elements of dst.
13414///
13415/// Rounding is done according to the rounding parameter, which can be one of:
13416///
13417/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13418/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13419/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13420/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13421/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13422///
13423/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsd_sh)
13424#[inline]
13425#[target_feature(enable = "avx512fp16")]
13426#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13427#[rustc_legacy_const_generics(2)]
13428#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13429pub fn _mm_cvt_roundsd_sh<const ROUNDING: i32>(a: __m128h, b: __m128d) -> __m128h {
13430    static_assert_rounding!(ROUNDING);
13431    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), 0xff, a, b)
13432}
13433
13434/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13435/// floating-point element, store the result in the lower element of dst using writemask k (the element
13436/// is copied from src when mask bit 0 is not set), and copy the upper 7 packed elements from a to the
13437/// upper elements of dst.
13438///
13439/// Rounding is done according to the rounding parameter, which can be one of:
13440///
13441/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13442/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13443/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13444/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13445/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13446///
13447/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsd_sh)
13448#[inline]
13449#[target_feature(enable = "avx512fp16")]
13450#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13451#[rustc_legacy_const_generics(4)]
13452#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13453pub fn _mm_mask_cvt_roundsd_sh<const ROUNDING: i32>(
13454    src: __m128h,
13455    k: __mmask8,
13456    a: __m128h,
13457    b: __m128d,
13458) -> __m128h {
13459    unsafe {
13460        static_assert_rounding!(ROUNDING);
13461        vcvtsd2sh(a, b, src, k, ROUNDING)
13462    }
13463}
13464
13465/// Convert the lower double-precision (64-bit) floating-point element in b to a half-precision (16-bit)
13466/// floating-point element, store the result in the lower element of dst using zeromask k (the element
13467/// is zeroed out when mask bit 0 is not set), and copy the upper 7 packed elements from a to the upper
13468/// elements of dst.
13469///
13470/// Rounding is done according to the rounding parameter, which can be one of:
13471///
13472/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13473/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13474/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13475/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13476/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13477///
13478/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsd_sh)
13479#[inline]
13480#[target_feature(enable = "avx512fp16")]
13481#[cfg_attr(test, assert_instr(vcvtsd2sh, ROUNDING = 8))]
13482#[rustc_legacy_const_generics(3)]
13483#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13484pub fn _mm_maskz_cvt_roundsd_sh<const ROUNDING: i32>(
13485    k: __mmask8,
13486    a: __m128h,
13487    b: __m128d,
13488) -> __m128h {
13489    static_assert_rounding!(ROUNDING);
13490    _mm_mask_cvt_roundsd_sh::<ROUNDING>(f16x8::ZERO.as_m128h(), k, a, b)
13491}
13492
13493/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13494/// store the results in dst.
13495///
13496/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi16)
13497#[inline]
13498#[target_feature(enable = "avx512fp16,avx512vl")]
13499#[cfg_attr(test, assert_instr(vcvtph2w))]
13500#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13501pub fn _mm_cvtph_epi16(a: __m128h) -> __m128i {
13502    _mm_mask_cvtph_epi16(_mm_undefined_si128(), 0xff, a)
13503}
13504
13505/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13506/// store the results in dst using writemask k (elements are copied from src when the corresponding
13507/// mask bit is not set).
13508///
13509/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi16)
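///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` and
/// `avx512vl` are enabled and that `_mm_set1_ph` and the SSE2 `_mm_set1_epi16` are in scope):
///
/// ```ignore
/// // Illustrative only.
/// let src = _mm_set1_epi16(-1);
/// let a = _mm_set1_ph(5.0);
/// // Only lanes 0 and 1 are converted; the other lanes keep the value from `src`.
/// let r = _mm_mask_cvtph_epi16(src, 0b0000_0011, a);
/// ```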
13510#[inline]
13511#[target_feature(enable = "avx512fp16,avx512vl")]
13512#[cfg_attr(test, assert_instr(vcvtph2w))]
13513#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13514pub fn _mm_mask_cvtph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13515    unsafe { transmute(vcvtph2w_128(a, src.as_i16x8(), k)) }
13516}
13517
13518/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13519/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13520///
13521/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi16)
13522#[inline]
13523#[target_feature(enable = "avx512fp16,avx512vl")]
13524#[cfg_attr(test, assert_instr(vcvtph2w))]
13525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13526pub fn _mm_maskz_cvtph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13527    _mm_mask_cvtph_epi16(_mm_setzero_si128(), k, a)
13528}
13529
13530/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13531/// store the results in dst.
13532///
13533/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi16)
13534#[inline]
13535#[target_feature(enable = "avx512fp16,avx512vl")]
13536#[cfg_attr(test, assert_instr(vcvtph2w))]
13537#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13538pub fn _mm256_cvtph_epi16(a: __m256h) -> __m256i {
13539    _mm256_mask_cvtph_epi16(_mm256_undefined_si256(), 0xffff, a)
13540}
13541
13542/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13543/// store the results in dst using writemask k (elements are copied from src when the corresponding
13544/// mask bit is not set).
13545///
13546/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi16)
13547#[inline]
13548#[target_feature(enable = "avx512fp16,avx512vl")]
13549#[cfg_attr(test, assert_instr(vcvtph2w))]
13550#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13551pub fn _mm256_mask_cvtph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13552    unsafe { transmute(vcvtph2w_256(a, src.as_i16x16(), k)) }
13553}
13554
13555/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13556/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13557///
13558/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi16)
13559#[inline]
13560#[target_feature(enable = "avx512fp16,avx512vl")]
13561#[cfg_attr(test, assert_instr(vcvtph2w))]
13562#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13563pub fn _mm256_maskz_cvtph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13564    _mm256_mask_cvtph_epi16(_mm256_setzero_si256(), k, a)
13565}
13566
13567/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13568/// store the results in dst.
13569///
13570/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi16)
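///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` is
/// enabled and `_mm512_set1_ph` is in scope):
///
/// ```ignore
/// // Illustrative only; each of the 32 f16 lanes becomes a signed 16-bit integer.
/// let a = _mm512_set1_ph(-3.0);
/// let r: __m512i = _mm512_cvtph_epi16(a);
/// ```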
13571#[inline]
13572#[target_feature(enable = "avx512fp16")]
13573#[cfg_attr(test, assert_instr(vcvtph2w))]
13574#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13575pub fn _mm512_cvtph_epi16(a: __m512h) -> __m512i {
13576    _mm512_mask_cvtph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13577}
13578
13579/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13580/// store the results in dst using writemask k (elements are copied from src when the corresponding
13581/// mask bit is not set).
13582///
13583/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi16)
13584#[inline]
13585#[target_feature(enable = "avx512fp16")]
13586#[cfg_attr(test, assert_instr(vcvtph2w))]
13587#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13588pub fn _mm512_mask_cvtph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13589    unsafe {
13590        transmute(vcvtph2w_512(
13591            a,
13592            src.as_i16x32(),
13593            k,
13594            _MM_FROUND_CUR_DIRECTION,
13595        ))
13596    }
13597}
13598
13599/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13600/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13601///
13602/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi16)
13603#[inline]
13604#[target_feature(enable = "avx512fp16")]
13605#[cfg_attr(test, assert_instr(vcvtph2w))]
13606#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13607pub fn _mm512_maskz_cvtph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13608    _mm512_mask_cvtph_epi16(_mm512_setzero_si512(), k, a)
13609}
13610
13611/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13612/// store the results in dst.
13613///
13614/// Rounding is done according to the rounding parameter, which can be one of:
13615///
13616/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13617/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13618/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13619/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13620/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13621///
13622/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi16)
13623#[inline]
13624#[target_feature(enable = "avx512fp16")]
13625#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13626#[rustc_legacy_const_generics(1)]
13627#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13628pub fn _mm512_cvt_roundph_epi16<const ROUNDING: i32>(a: __m512h) -> __m512i {
13629    static_assert_rounding!(ROUNDING);
13630    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_undefined_epi32(), 0xffffffff, a)
13631}
13632
13633/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13634/// store the results in dst using writemask k (elements are copied from src when the corresponding
13635/// mask bit is not set).
13636///
13637/// Rounding is done according to the rounding parameter, which can be one of:
13638///
13639/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13640/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13641/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13642/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13643/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13644///
13645/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi16)
13646#[inline]
13647#[target_feature(enable = "avx512fp16")]
13648#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13649#[rustc_legacy_const_generics(3)]
13650#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13651pub fn _mm512_mask_cvt_roundph_epi16<const ROUNDING: i32>(
13652    src: __m512i,
13653    k: __mmask32,
13654    a: __m512h,
13655) -> __m512i {
13656    unsafe {
13657        static_assert_rounding!(ROUNDING);
13658        transmute(vcvtph2w_512(a, src.as_i16x32(), k, ROUNDING))
13659    }
13660}
13661
13662/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers, and
13663/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13664///
13665/// Rounding is done according to the rounding parameter, which can be one of:
13666///
13667/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
13668/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
13669/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
13670/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
13671/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
13672///
13673/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi16)
13674#[inline]
13675#[target_feature(enable = "avx512fp16")]
13676#[cfg_attr(test, assert_instr(vcvtph2w, ROUNDING = 8))]
13677#[rustc_legacy_const_generics(2)]
13678#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13679pub fn _mm512_maskz_cvt_roundph_epi16<const ROUNDING: i32>(k: __mmask32, a: __m512h) -> __m512i {
13680    static_assert_rounding!(ROUNDING);
13681    _mm512_mask_cvt_roundph_epi16::<ROUNDING>(_mm512_setzero_si512(), k, a)
13682}
13683
13684/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13685/// and store the results in dst.
13686///
13687/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu16)
13688#[inline]
13689#[target_feature(enable = "avx512fp16,avx512vl")]
13690#[cfg_attr(test, assert_instr(vcvtph2uw))]
13691#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13692pub fn _mm_cvtph_epu16(a: __m128h) -> __m128i {
13693    _mm_mask_cvtph_epu16(_mm_undefined_si128(), 0xff, a)
13694}
13695
13696/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13697/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13698/// mask bit is not set).
13699///
13700/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu16)
13701#[inline]
13702#[target_feature(enable = "avx512fp16,avx512vl")]
13703#[cfg_attr(test, assert_instr(vcvtph2uw))]
13704#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13705pub fn _mm_mask_cvtph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13706    unsafe { transmute(vcvtph2uw_128(a, src.as_u16x8(), k)) }
13707}
13708
13709/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13710/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13711///
13712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu16)
13713#[inline]
13714#[target_feature(enable = "avx512fp16,avx512vl")]
13715#[cfg_attr(test, assert_instr(vcvtph2uw))]
13716#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13717pub fn _mm_maskz_cvtph_epu16(k: __mmask8, a: __m128h) -> __m128i {
13718    _mm_mask_cvtph_epu16(_mm_setzero_si128(), k, a)
13719}
13720
13721/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13722/// and store the results in dst.
13723///
13724/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu16)
13725#[inline]
13726#[target_feature(enable = "avx512fp16,avx512vl")]
13727#[cfg_attr(test, assert_instr(vcvtph2uw))]
13728#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13729pub fn _mm256_cvtph_epu16(a: __m256h) -> __m256i {
13730    _mm256_mask_cvtph_epu16(_mm256_undefined_si256(), 0xffff, a)
13731}
13732
13733/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13734/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13735/// mask bit is not set).
13736///
13737/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu16)
13738#[inline]
13739#[target_feature(enable = "avx512fp16,avx512vl")]
13740#[cfg_attr(test, assert_instr(vcvtph2uw))]
13741#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13742pub fn _mm256_mask_cvtph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13743    unsafe { transmute(vcvtph2uw_256(a, src.as_u16x16(), k)) }
13744}
13745
13746/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13747/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13748///
13749/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu16)
13750#[inline]
13751#[target_feature(enable = "avx512fp16,avx512vl")]
13752#[cfg_attr(test, assert_instr(vcvtph2uw))]
13753#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13754pub fn _mm256_maskz_cvtph_epu16(k: __mmask16, a: __m256h) -> __m256i {
13755    _mm256_mask_cvtph_epu16(_mm256_setzero_si256(), k, a)
13756}
13757
13758/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13759/// and store the results in dst.
13760///
13761/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu16)
13762#[inline]
13763#[target_feature(enable = "avx512fp16")]
13764#[cfg_attr(test, assert_instr(vcvtph2uw))]
13765#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13766pub fn _mm512_cvtph_epu16(a: __m512h) -> __m512i {
13767    _mm512_mask_cvtph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
13768}
13769
13770/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13771/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13772/// mask bit is not set).
13773///
13774/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu16)
13775#[inline]
13776#[target_feature(enable = "avx512fp16")]
13777#[cfg_attr(test, assert_instr(vcvtph2uw))]
13778#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13779pub fn _mm512_mask_cvtph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13780    unsafe {
13781        transmute(vcvtph2uw_512(
13782            a,
13783            src.as_u16x32(),
13784            k,
13785            _MM_FROUND_CUR_DIRECTION,
13786        ))
13787    }
13788}
13789
13790/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13791/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13792///
13793/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu16)
13794#[inline]
13795#[target_feature(enable = "avx512fp16")]
13796#[cfg_attr(test, assert_instr(vcvtph2uw))]
13797#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13798pub fn _mm512_maskz_cvtph_epu16(k: __mmask32, a: __m512h) -> __m512i {
13799    _mm512_mask_cvtph_epu16(_mm512_setzero_si512(), k, a)
13800}
13801
13802/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13803/// and store the results in dst.
13804///
13805/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13806///
13807/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu16)
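///
/// A minimal usage sketch (marked `ignore`, not run as a doc-test; assumes `avx512fp16` is
/// enabled and `_mm512_set1_ph` is in scope):
///
/// ```ignore
/// // Illustrative only: convert with floating-point exceptions suppressed (SAE).
/// let a = _mm512_set1_ph(7.0);
/// let r = _mm512_cvt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
/// ```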
13808#[inline]
13809#[target_feature(enable = "avx512fp16")]
13810#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13811#[rustc_legacy_const_generics(1)]
13812#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13813pub fn _mm512_cvt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
13814    static_assert_sae!(SAE);
13815    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13816}
13817
13818/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13819/// and store the results in dst using writemask k (elements are copied from src when the corresponding
13820/// mask bit is not set).
13821///
13822/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13823///
13824/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu16)
13825#[inline]
13826#[target_feature(enable = "avx512fp16")]
13827#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13828#[rustc_legacy_const_generics(3)]
13829#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13830pub fn _mm512_mask_cvt_roundph_epu16<const SAE: i32>(
13831    src: __m512i,
13832    k: __mmask32,
13833    a: __m512h,
13834) -> __m512i {
13835    unsafe {
13836        static_assert_sae!(SAE);
13837        transmute(vcvtph2uw_512(a, src.as_u16x32(), k, SAE))
13838    }
13839}
13840
13841/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers,
13842/// and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
13843///
13844/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13845///
13846/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu16)
13847#[inline]
13848#[target_feature(enable = "avx512fp16")]
13849#[cfg_attr(test, assert_instr(vcvtph2uw, SAE = 8))]
13850#[rustc_legacy_const_generics(2)]
13851#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13852pub fn _mm512_maskz_cvt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
13853    static_assert_sae!(SAE);
13854    _mm512_mask_cvt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
13855}
13856
13857/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13858/// truncation, and store the results in dst.
13859///
13860/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi16)
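///
/// An illustrative sketch (not a doctest; same nightly and AVX512-FP16 assumptions as the other
/// examples in this file) showing that truncation always rounds toward zero, independent of `MXCSR.RC`:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// // `_mm_set_ph` takes its arguments from element 7 down to element 0.
/// let a = _mm_set_ph(7.9, 6.9, 5.9, 4.9, 3.9, 2.9, 1.9, -1.9);
/// let r = _mm_cvttph_epi16(a); // lanes 0..7 hold [-1, 1, 2, 3, 4, 5, 6, 7]
/// ```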
13861#[inline]
13862#[target_feature(enable = "avx512fp16,avx512vl")]
13863#[cfg_attr(test, assert_instr(vcvttph2w))]
13864#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13865pub fn _mm_cvttph_epi16(a: __m128h) -> __m128i {
13866    _mm_mask_cvttph_epi16(_mm_undefined_si128(), 0xff, a)
13867}
13868
13869/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13870/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13871/// mask bit is not set).
13872///
13873/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi16)
13874#[inline]
13875#[target_feature(enable = "avx512fp16,avx512vl")]
13876#[cfg_attr(test, assert_instr(vcvttph2w))]
13877#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13878pub fn _mm_mask_cvttph_epi16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
13879    unsafe { transmute(vcvttph2w_128(a, src.as_i16x8(), k)) }
13880}
13881
13882/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13883/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13884/// mask bit is not set).
13885///
13886/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi16)
13887#[inline]
13888#[target_feature(enable = "avx512fp16,avx512vl")]
13889#[cfg_attr(test, assert_instr(vcvttph2w))]
13890#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13891pub fn _mm_maskz_cvttph_epi16(k: __mmask8, a: __m128h) -> __m128i {
13892    _mm_mask_cvttph_epi16(_mm_setzero_si128(), k, a)
13893}
13894
13895/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13896/// truncation, and store the results in dst.
13897///
13898/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi16)
13899#[inline]
13900#[target_feature(enable = "avx512fp16,avx512vl")]
13901#[cfg_attr(test, assert_instr(vcvttph2w))]
13902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13903pub fn _mm256_cvttph_epi16(a: __m256h) -> __m256i {
13904    _mm256_mask_cvttph_epi16(_mm256_undefined_si256(), 0xffff, a)
13905}
13906
13907/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13908/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13909/// mask bit is not set).
13910///
13911/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi16)
13912#[inline]
13913#[target_feature(enable = "avx512fp16,avx512vl")]
13914#[cfg_attr(test, assert_instr(vcvttph2w))]
13915#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13916pub fn _mm256_mask_cvttph_epi16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
13917    unsafe { transmute(vcvttph2w_256(a, src.as_i16x16(), k)) }
13918}
13919
13920/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13921/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13922/// mask bit is not set).
13923///
13924/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi16)
13925#[inline]
13926#[target_feature(enable = "avx512fp16,avx512vl")]
13927#[cfg_attr(test, assert_instr(vcvttph2w))]
13928#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13929pub fn _mm256_maskz_cvttph_epi16(k: __mmask16, a: __m256h) -> __m256i {
13930    _mm256_mask_cvttph_epi16(_mm256_setzero_si256(), k, a)
13931}
13932
13933/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13934/// truncation, and store the results in dst.
13935///
13936/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi16)
13937#[inline]
13938#[target_feature(enable = "avx512fp16")]
13939#[cfg_attr(test, assert_instr(vcvttph2w))]
13940#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13941pub fn _mm512_cvttph_epi16(a: __m512h) -> __m512i {
13942    _mm512_mask_cvttph_epi16(_mm512_undefined_epi32(), 0xffffffff, a)
13943}
13944
13945/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13946/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13947/// mask bit is not set).
13948///
13949/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi16)
13950#[inline]
13951#[target_feature(enable = "avx512fp16")]
13952#[cfg_attr(test, assert_instr(vcvttph2w))]
13953#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13954pub fn _mm512_mask_cvttph_epi16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
13955    unsafe {
13956        transmute(vcvttph2w_512(
13957            a,
13958            src.as_i16x32(),
13959            k,
13960            _MM_FROUND_CUR_DIRECTION,
13961        ))
13962    }
13963}
13964
13965/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13966/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
13967/// mask bit is not set).
13968///
13969/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi16)
13970#[inline]
13971#[target_feature(enable = "avx512fp16")]
13972#[cfg_attr(test, assert_instr(vcvttph2w))]
13973#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13974pub fn _mm512_maskz_cvttph_epi16(k: __mmask32, a: __m512h) -> __m512i {
13975    _mm512_mask_cvttph_epi16(_mm512_setzero_si512(), k, a)
13976}
13977
13978/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13979/// truncation, and store the results in dst.
13980///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13982///
13983/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi16)
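///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above), showing how
/// the exception-suppression constant is passed as the const generic argument:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm512_set1_ph(-3.75);
/// // Truncate toward zero while suppressing floating-point exceptions: every lane becomes -3.
/// let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
/// ```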
13984#[inline]
13985#[target_feature(enable = "avx512fp16")]
13986#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
13987#[rustc_legacy_const_generics(1)]
13988#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
13989pub fn _mm512_cvtt_roundph_epi16<const SAE: i32>(a: __m512h) -> __m512i {
13990    static_assert_sae!(SAE);
13991    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
13992}
13993
13994/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
13995/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
13996/// mask bit is not set).
13997///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
13999///
14000/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi16)
14001#[inline]
14002#[target_feature(enable = "avx512fp16")]
14003#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
14004#[rustc_legacy_const_generics(3)]
14005#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14006pub fn _mm512_mask_cvtt_roundph_epi16<const SAE: i32>(
14007    src: __m512i,
14008    k: __mmask32,
14009    a: __m512h,
14010) -> __m512i {
14011    unsafe {
14012        static_assert_sae!(SAE);
14013        transmute(vcvttph2w_512(a, src.as_i16x32(), k, SAE))
14014    }
14015}
14016
14017/// Convert packed half-precision (16-bit) floating-point elements in a to packed 16-bit integers with
14018/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14019/// mask bit is not set).
14020///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14022///
14023/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi16)
14024#[inline]
14025#[target_feature(enable = "avx512fp16")]
14026#[cfg_attr(test, assert_instr(vcvttph2w, SAE = 8))]
14027#[rustc_legacy_const_generics(2)]
14028#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14029pub fn _mm512_maskz_cvtt_roundph_epi16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14030    static_assert_sae!(SAE);
14031    _mm512_mask_cvtt_roundph_epi16::<SAE>(_mm512_setzero_si512(), k, a)
14032}
14033
14034/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14035/// truncation, and store the results in dst.
14036///
14037/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu16)
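///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above), also showing
/// the zero-masked variant defined later in this file:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_ph(8.5, 7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5);
/// let r = _mm_cvttph_epu16(a);                    // lanes 0..7: [1, 2, 3, 4, 5, 6, 7, 8]
/// let z = _mm_maskz_cvttph_epu16(0b0000_0011, a); // only lanes 0 and 1 are kept, the rest are zeroed
/// ```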
14038#[inline]
14039#[target_feature(enable = "avx512fp16,avx512vl")]
14040#[cfg_attr(test, assert_instr(vcvttph2uw))]
14041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14042pub fn _mm_cvttph_epu16(a: __m128h) -> __m128i {
14043    _mm_mask_cvttph_epu16(_mm_undefined_si128(), 0xff, a)
14044}
14045
14046/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14047/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14048/// mask bit is not set).
14049///
14050/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu16)
14051#[inline]
14052#[target_feature(enable = "avx512fp16,avx512vl")]
14053#[cfg_attr(test, assert_instr(vcvttph2uw))]
14054#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14055pub fn _mm_mask_cvttph_epu16(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14056    unsafe { transmute(vcvttph2uw_128(a, src.as_u16x8(), k)) }
14057}
14058
14059/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14060/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14061/// mask bit is not set).
14062///
14063/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu16)
14064#[inline]
14065#[target_feature(enable = "avx512fp16,avx512vl")]
14066#[cfg_attr(test, assert_instr(vcvttph2uw))]
14067#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14068pub fn _mm_maskz_cvttph_epu16(k: __mmask8, a: __m128h) -> __m128i {
14069    _mm_mask_cvttph_epu16(_mm_setzero_si128(), k, a)
14070}
14071
14072/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14073/// truncation, and store the results in dst.
14074///
14075/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu16)
14076#[inline]
14077#[target_feature(enable = "avx512fp16,avx512vl")]
14078#[cfg_attr(test, assert_instr(vcvttph2uw))]
14079#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14080pub fn _mm256_cvttph_epu16(a: __m256h) -> __m256i {
14081    _mm256_mask_cvttph_epu16(_mm256_undefined_si256(), 0xffff, a)
14082}
14083
14084/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14085/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14086/// mask bit is not set).
14087///
14088/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu16)
14089#[inline]
14090#[target_feature(enable = "avx512fp16,avx512vl")]
14091#[cfg_attr(test, assert_instr(vcvttph2uw))]
14092#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14093pub fn _mm256_mask_cvttph_epu16(src: __m256i, k: __mmask16, a: __m256h) -> __m256i {
14094    unsafe { transmute(vcvttph2uw_256(a, src.as_u16x16(), k)) }
14095}
14096
14097/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14098/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14099/// mask bit is not set).
14100///
14101/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu16)
14102#[inline]
14103#[target_feature(enable = "avx512fp16,avx512vl")]
14104#[cfg_attr(test, assert_instr(vcvttph2uw))]
14105#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14106pub fn _mm256_maskz_cvttph_epu16(k: __mmask16, a: __m256h) -> __m256i {
14107    _mm256_mask_cvttph_epu16(_mm256_setzero_si256(), k, a)
14108}
14109
14110/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14111/// truncation, and store the results in dst.
14112///
14113/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu16)
14114#[inline]
14115#[target_feature(enable = "avx512fp16")]
14116#[cfg_attr(test, assert_instr(vcvttph2uw))]
14117#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14118pub fn _mm512_cvttph_epu16(a: __m512h) -> __m512i {
14119    _mm512_mask_cvttph_epu16(_mm512_undefined_epi32(), 0xffffffff, a)
14120}
14121
14122/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14123/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14124/// mask bit is not set).
14125///
14126/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu16)
14127#[inline]
14128#[target_feature(enable = "avx512fp16")]
14129#[cfg_attr(test, assert_instr(vcvttph2uw))]
14130#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14131pub fn _mm512_mask_cvttph_epu16(src: __m512i, k: __mmask32, a: __m512h) -> __m512i {
14132    unsafe {
14133        transmute(vcvttph2uw_512(
14134            a,
14135            src.as_u16x32(),
14136            k,
14137            _MM_FROUND_CUR_DIRECTION,
14138        ))
14139    }
14140}
14141
14142/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14143/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14144/// mask bit is not set).
14145///
14146/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu16)
14147#[inline]
14148#[target_feature(enable = "avx512fp16")]
14149#[cfg_attr(test, assert_instr(vcvttph2uw))]
14150#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14151pub fn _mm512_maskz_cvttph_epu16(k: __mmask32, a: __m512h) -> __m512i {
14152    _mm512_mask_cvttph_epu16(_mm512_setzero_si512(), k, a)
14153}
14154
14155/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14156/// truncation, and store the results in dst.
14157///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14159///
14160/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu16)
14161#[inline]
14162#[target_feature(enable = "avx512fp16")]
14163#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14164#[rustc_legacy_const_generics(1)]
14165#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14166pub fn _mm512_cvtt_roundph_epu16<const SAE: i32>(a: __m512h) -> __m512i {
14167    static_assert_sae!(SAE);
14168    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_undefined_epi32(), 0xffffffff, a)
14169}
14170
14171/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14172/// truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding
14173/// mask bit is not set).
14174///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14176///
14177/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu16)
14178#[inline]
14179#[target_feature(enable = "avx512fp16")]
14180#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14181#[rustc_legacy_const_generics(3)]
14182#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14183pub fn _mm512_mask_cvtt_roundph_epu16<const SAE: i32>(
14184    src: __m512i,
14185    k: __mmask32,
14186    a: __m512h,
14187) -> __m512i {
14188    unsafe {
14189        static_assert_sae!(SAE);
14190        transmute(vcvttph2uw_512(a, src.as_u16x32(), k, SAE))
14191    }
14192}
14193
14194/// Convert packed half-precision (16-bit) floating-point elements in a to packed unsigned 16-bit integers with
14195/// truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding
14196/// mask bit is not set).
14197///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the sae parameter.
14199///
14200/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu16)
14201#[inline]
14202#[target_feature(enable = "avx512fp16")]
14203#[cfg_attr(test, assert_instr(vcvttph2uw, SAE = 8))]
14204#[rustc_legacy_const_generics(2)]
14205#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14206pub fn _mm512_maskz_cvtt_roundph_epu16<const SAE: i32>(k: __mmask32, a: __m512h) -> __m512i {
14207    static_assert_sae!(SAE);
14208    _mm512_mask_cvtt_roundph_epu16::<SAE>(_mm512_setzero_si512(), k, a)
14209}
14210
14211/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14212/// results in dst.
14213///
14214/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi32)
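///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above). The result
/// has four 32-bit lanes, so only the four lower f16 elements of `a` are converted:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, -2.5, 3.5, 1.25, 100.0);
/// // With the default round-to-nearest-even mode, lanes 0..3 hold [100, 1, 4, -2].
/// let r = _mm_cvtph_epi32(a);
/// ```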
14215#[inline]
14216#[target_feature(enable = "avx512fp16,avx512vl")]
14217#[cfg_attr(test, assert_instr(vcvtph2dq))]
14218#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14219pub fn _mm_cvtph_epi32(a: __m128h) -> __m128i {
14220    _mm_mask_cvtph_epi32(_mm_undefined_si128(), 0xff, a)
14221}
14222
14223/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14224/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14225///
14226/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi32)
14227#[inline]
14228#[target_feature(enable = "avx512fp16,avx512vl")]
14229#[cfg_attr(test, assert_instr(vcvtph2dq))]
14230#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14231pub fn _mm_mask_cvtph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14232    unsafe { transmute(vcvtph2dq_128(a, src.as_i32x4(), k)) }
14233}
14234
14235/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14236/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14237///
14238/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi32)
14239#[inline]
14240#[target_feature(enable = "avx512fp16,avx512vl")]
14241#[cfg_attr(test, assert_instr(vcvtph2dq))]
14242#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14243pub fn _mm_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14244    _mm_mask_cvtph_epi32(_mm_setzero_si128(), k, a)
14245}
14246
14247/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14248/// results in dst.
14249///
14250/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi32)
14251#[inline]
14252#[target_feature(enable = "avx512fp16,avx512vl")]
14253#[cfg_attr(test, assert_instr(vcvtph2dq))]
14254#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14255pub fn _mm256_cvtph_epi32(a: __m128h) -> __m256i {
14256    _mm256_mask_cvtph_epi32(_mm256_undefined_si256(), 0xff, a)
14257}
14258
14259/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14260/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14261///
14262/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi32)
14263#[inline]
14264#[target_feature(enable = "avx512fp16,avx512vl")]
14265#[cfg_attr(test, assert_instr(vcvtph2dq))]
14266#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14267pub fn _mm256_mask_cvtph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14268    unsafe { transmute(vcvtph2dq_256(a, src.as_i32x8(), k)) }
14269}
14270
14271/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14272/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14273///
14274/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi32)
14275#[inline]
14276#[target_feature(enable = "avx512fp16,avx512vl")]
14277#[cfg_attr(test, assert_instr(vcvtph2dq))]
14278#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14279pub fn _mm256_maskz_cvtph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14280    _mm256_mask_cvtph_epi32(_mm256_setzero_si256(), k, a)
14281}
14282
14283/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14284/// results in dst.
14285///
14286/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi32)
14287#[inline]
14288#[target_feature(enable = "avx512fp16")]
14289#[cfg_attr(test, assert_instr(vcvtph2dq))]
14290#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14291pub fn _mm512_cvtph_epi32(a: __m256h) -> __m512i {
14292    _mm512_mask_cvtph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14293}
14294
14295/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14296/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14297///
14298/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi32)
14299#[inline]
14300#[target_feature(enable = "avx512fp16")]
14301#[cfg_attr(test, assert_instr(vcvtph2dq))]
14302#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14303pub fn _mm512_mask_cvtph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14304    unsafe {
14305        transmute(vcvtph2dq_512(
14306            a,
14307            src.as_i32x16(),
14308            k,
14309            _MM_FROUND_CUR_DIRECTION,
14310        ))
14311    }
14312}
14313
14314/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14315/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14316///
14317/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi32)
14318#[inline]
14319#[target_feature(enable = "avx512fp16")]
14320#[cfg_attr(test, assert_instr(vcvtph2dq))]
14321#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14322pub fn _mm512_maskz_cvtph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14323    _mm512_mask_cvtph_epi32(_mm512_setzero_si512(), k, a)
14324}
14325
14326/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14327/// results in dst.
14328///
14329/// Rounding is done according to the rounding parameter, which can be one of:
14330///
14331/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14332/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14333/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14334/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14335/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14336///
14337/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi32)
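///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above, plus the
/// `_mm256_set1_ph` broadcast from this module), showing an explicit rounding mode:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm256_set1_ph(2.5);
/// // Round toward negative infinity and suppress exceptions: every 32-bit lane becomes 2.
/// let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// ```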
14338#[inline]
14339#[target_feature(enable = "avx512fp16")]
14340#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14341#[rustc_legacy_const_generics(1)]
14342#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14343pub fn _mm512_cvt_roundph_epi32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14344    static_assert_rounding!(ROUNDING);
14345    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14346}
14347
14348/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14349/// results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14350///
14351/// Rounding is done according to the rounding parameter, which can be one of:
14352///
14353/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14354/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14355/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14356/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14357/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14358///
14359/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi32)
14360#[inline]
14361#[target_feature(enable = "avx512fp16")]
14362#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14363#[rustc_legacy_const_generics(3)]
14364#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14365pub fn _mm512_mask_cvt_roundph_epi32<const ROUNDING: i32>(
14366    src: __m512i,
14367    k: __mmask16,
14368    a: __m256h,
14369) -> __m512i {
14370    unsafe {
14371        static_assert_rounding!(ROUNDING);
14372        transmute(vcvtph2dq_512(a, src.as_i32x16(), k, ROUNDING))
14373    }
14374}
14375
14376/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers, and store the
14377/// results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14378///
14379/// Rounding is done according to the rounding parameter, which can be one of:
14380///
14381/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14382/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14383/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14384/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14385/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14386///
14387/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi32)
14388#[inline]
14389#[target_feature(enable = "avx512fp16")]
14390#[cfg_attr(test, assert_instr(vcvtph2dq, ROUNDING = 8))]
14391#[rustc_legacy_const_generics(2)]
14392#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14393pub fn _mm512_maskz_cvt_roundph_epi32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14394    static_assert_rounding!(ROUNDING);
14395    _mm512_mask_cvt_roundph_epi32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14396}
14397
14398/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14399/// the result in dst.
14400///
14401/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_i32)
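///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_sh(41.6);
/// // Uses the current MXCSR rounding mode; with the default round-to-nearest-even this yields 42.
/// let r: i32 = _mm_cvtsh_i32(a);
/// ```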
14402#[inline]
14403#[target_feature(enable = "avx512fp16")]
14404#[cfg_attr(test, assert_instr(vcvtsh2si))]
14405#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14406pub fn _mm_cvtsh_i32(a: __m128h) -> i32 {
14407    unsafe { vcvtsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14408}
14409
14410/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer, and store
14411/// the result in dst.
14412///
14413/// Rounding is done according to the rounding parameter, which can be one of:
14414///
14415/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14416/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14417/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14418/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14419/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14420///
14421/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_i32)
14422#[inline]
14423#[target_feature(enable = "avx512fp16")]
14424#[cfg_attr(test, assert_instr(vcvtsh2si, ROUNDING = 8))]
14425#[rustc_legacy_const_generics(1)]
14426#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14427pub fn _mm_cvt_roundsh_i32<const ROUNDING: i32>(a: __m128h) -> i32 {
14428    unsafe {
14429        static_assert_rounding!(ROUNDING);
14430        vcvtsh2si32(a, ROUNDING)
14431    }
14432}
14433
/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
/// the results in dst.
14436///
14437/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu32)
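///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above). Only the four
/// lower f16 elements of `a` are converted, since the result holds four unsigned 32-bit lanes:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1000.0, 7.5, 0.25, 3.0);
/// // With the default round-to-nearest-even mode, lanes 0..3 hold [3, 0, 8, 1000].
/// let r = _mm_cvtph_epu32(a);
/// ```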
14438#[inline]
14439#[target_feature(enable = "avx512fp16,avx512vl")]
14440#[cfg_attr(test, assert_instr(vcvtph2udq))]
14441#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14442pub fn _mm_cvtph_epu32(a: __m128h) -> __m128i {
14443    _mm_mask_cvtph_epu32(_mm_undefined_si128(), 0xff, a)
14444}
14445
14446/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14447/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14448///
14449/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu32)
14450#[inline]
14451#[target_feature(enable = "avx512fp16,avx512vl")]
14452#[cfg_attr(test, assert_instr(vcvtph2udq))]
14453#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14454pub fn _mm_mask_cvtph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14455    unsafe { transmute(vcvtph2udq_128(a, src.as_u32x4(), k)) }
14456}
14457
14458/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14459/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14460///
14461/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu32)
14462#[inline]
14463#[target_feature(enable = "avx512fp16,avx512vl")]
14464#[cfg_attr(test, assert_instr(vcvtph2udq))]
14465#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14466pub fn _mm_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14467    _mm_mask_cvtph_epu32(_mm_setzero_si128(), k, a)
14468}
14469
14470/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14471/// the results in dst.
14472///
14473/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu32)
14474#[inline]
14475#[target_feature(enable = "avx512fp16,avx512vl")]
14476#[cfg_attr(test, assert_instr(vcvtph2udq))]
14477#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14478pub fn _mm256_cvtph_epu32(a: __m128h) -> __m256i {
14479    _mm256_mask_cvtph_epu32(_mm256_undefined_si256(), 0xff, a)
14480}
14481
14482/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14483/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14484///
14485/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu32)
14486#[inline]
14487#[target_feature(enable = "avx512fp16,avx512vl")]
14488#[cfg_attr(test, assert_instr(vcvtph2udq))]
14489#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14490pub fn _mm256_mask_cvtph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14491    unsafe { transmute(vcvtph2udq_256(a, src.as_u32x8(), k)) }
14492}
14493
14494/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14495/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14496///
14497/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu32)
14498#[inline]
14499#[target_feature(enable = "avx512fp16,avx512vl")]
14500#[cfg_attr(test, assert_instr(vcvtph2udq))]
14501#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14502pub fn _mm256_maskz_cvtph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14503    _mm256_mask_cvtph_epu32(_mm256_setzero_si256(), k, a)
14504}
14505
14506/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14507/// the results in dst.
14508///
14509/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu32)
14510#[inline]
14511#[target_feature(enable = "avx512fp16")]
14512#[cfg_attr(test, assert_instr(vcvtph2udq))]
14513#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14514pub fn _mm512_cvtph_epu32(a: __m256h) -> __m512i {
14515    _mm512_mask_cvtph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14516}
14517
14518/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14519/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14520///
14521/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu32)
14522#[inline]
14523#[target_feature(enable = "avx512fp16")]
14524#[cfg_attr(test, assert_instr(vcvtph2udq))]
14525#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14526pub fn _mm512_mask_cvtph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14527    unsafe {
14528        transmute(vcvtph2udq_512(
14529            a,
14530            src.as_u32x16(),
14531            k,
14532            _MM_FROUND_CUR_DIRECTION,
14533        ))
14534    }
14535}
14536
14537/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14538/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14539///
14540/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu32)
14541#[inline]
14542#[target_feature(enable = "avx512fp16")]
14543#[cfg_attr(test, assert_instr(vcvtph2udq))]
14544#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14545pub fn _mm512_maskz_cvtph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14546    _mm512_mask_cvtph_epu32(_mm512_setzero_si512(), k, a)
14547}
14548
14549/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14550/// the results in dst.
14551///
14552/// Rounding is done according to the rounding parameter, which can be one of:
14553///
14554/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14555/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14556/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14557/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14558/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14559///
14560/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu32)
14561#[inline]
14562#[target_feature(enable = "avx512fp16")]
14563#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14564#[rustc_legacy_const_generics(1)]
14565#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14566pub fn _mm512_cvt_roundph_epu32<const ROUNDING: i32>(a: __m256h) -> __m512i {
14567    static_assert_rounding!(ROUNDING);
14568    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_undefined_epi32(), 0xffff, a)
14569}
14570
14571/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14572/// the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14573///
14574/// Rounding is done according to the rounding parameter, which can be one of:
14575///
14576/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14577/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14578/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14579/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14580/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14581///
14582/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu32)
14583#[inline]
14584#[target_feature(enable = "avx512fp16")]
14585#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14586#[rustc_legacy_const_generics(3)]
14587#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14588pub fn _mm512_mask_cvt_roundph_epu32<const ROUNDING: i32>(
14589    src: __m512i,
14590    k: __mmask16,
14591    a: __m256h,
14592) -> __m512i {
14593    unsafe {
14594        static_assert_rounding!(ROUNDING);
14595        transmute(vcvtph2udq_512(a, src.as_u32x16(), k, ROUNDING))
14596    }
14597}
14598
14599/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers, and store
14600/// the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14601///
14602/// Rounding is done according to the rounding parameter, which can be one of:
14603///
14604/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
14605/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
14606/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
14607/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
14608/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14609///
14610/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu32)
14611#[inline]
14612#[target_feature(enable = "avx512fp16")]
14613#[cfg_attr(test, assert_instr(vcvtph2udq, ROUNDING = 8))]
14614#[rustc_legacy_const_generics(2)]
14615#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14616pub fn _mm512_maskz_cvt_roundph_epu32<const ROUNDING: i32>(k: __mmask16, a: __m256h) -> __m512i {
14617    static_assert_rounding!(ROUNDING);
14618    _mm512_mask_cvt_roundph_epu32::<ROUNDING>(_mm512_setzero_si512(), k, a)
14619}
14620
14621/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14622/// the result in dst.
14623///
14624/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_u32)
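///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_sh(7.5);
/// let r: u32 = _mm_cvtsh_u32(a); // 8 with the default round-to-nearest-even mode
/// ```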
14625#[inline]
14626#[target_feature(enable = "avx512fp16")]
14627#[cfg_attr(test, assert_instr(vcvtsh2usi))]
14628#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14629pub fn _mm_cvtsh_u32(a: __m128h) -> u32 {
14630    unsafe { vcvtsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
14631}
14632
14633/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer, and store
14634/// the result in dst.
14635///
/// Rounding is done according to the rounding parameter, which can be one of:
///
/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
14637///
14638/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_u32)
14639#[inline]
14640#[target_feature(enable = "avx512fp16")]
#[cfg_attr(test, assert_instr(vcvtsh2usi, ROUNDING = 8))]
14642#[rustc_legacy_const_generics(1)]
14643#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
pub fn _mm_cvt_roundsh_u32<const ROUNDING: i32>(a: __m128h) -> u32 {
    unsafe {
        static_assert_rounding!(ROUNDING);
        vcvtsh2usi32(a, ROUNDING)
14648    }
14649}
14650
14651/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14652/// store the results in dst.
14653///
14654/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi32)
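///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above). Truncation
/// drops the fractional part, and only the four lower f16 elements of `a` are converted:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, -7.9, 2.5, -0.5, 9.9);
/// let r = _mm_cvttph_epi32(a); // lanes 0..3 hold [9, 0, 2, -7]
/// ```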
14655#[inline]
14656#[target_feature(enable = "avx512fp16,avx512vl")]
14657#[cfg_attr(test, assert_instr(vcvttph2dq))]
14658#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14659pub fn _mm_cvttph_epi32(a: __m128h) -> __m128i {
14660    _mm_mask_cvttph_epi32(_mm_undefined_si128(), 0xff, a)
14661}
14662
14663/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14664/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14665///
14666/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi32)
14667#[inline]
14668#[target_feature(enable = "avx512fp16,avx512vl")]
14669#[cfg_attr(test, assert_instr(vcvttph2dq))]
14670#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14671pub fn _mm_mask_cvttph_epi32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14672    unsafe { transmute(vcvttph2dq_128(a, src.as_i32x4(), k)) }
14673}
14674
14675/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14676/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14677///
14678/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi32)
14679#[inline]
14680#[target_feature(enable = "avx512fp16,avx512vl")]
14681#[cfg_attr(test, assert_instr(vcvttph2dq))]
14682#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14683pub fn _mm_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m128i {
14684    _mm_mask_cvttph_epi32(_mm_setzero_si128(), k, a)
14685}
14686
14687/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14688/// store the results in dst.
14689///
14690/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi32)
14691#[inline]
14692#[target_feature(enable = "avx512fp16,avx512vl")]
14693#[cfg_attr(test, assert_instr(vcvttph2dq))]
14694#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14695pub fn _mm256_cvttph_epi32(a: __m128h) -> __m256i {
14696    _mm256_mask_cvttph_epi32(_mm256_undefined_si256(), 0xff, a)
14697}
14698
14699/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14700/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14701///
14702/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi32)
14703#[inline]
14704#[target_feature(enable = "avx512fp16,avx512vl")]
14705#[cfg_attr(test, assert_instr(vcvttph2dq))]
14706#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14707pub fn _mm256_mask_cvttph_epi32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14708    unsafe { transmute(vcvttph2dq_256(a, src.as_i32x8(), k)) }
14709}
14710
14711/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14712/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14713///
14714/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi32)
14715#[inline]
14716#[target_feature(enable = "avx512fp16,avx512vl")]
14717#[cfg_attr(test, assert_instr(vcvttph2dq))]
14718#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14719pub fn _mm256_maskz_cvttph_epi32(k: __mmask8, a: __m128h) -> __m256i {
14720    _mm256_mask_cvttph_epi32(_mm256_setzero_si256(), k, a)
14721}
14722
14723/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14724/// store the results in dst.
14725///
14726/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi32)
14727#[inline]
14728#[target_feature(enable = "avx512fp16")]
14729#[cfg_attr(test, assert_instr(vcvttph2dq))]
14730#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14731pub fn _mm512_cvttph_epi32(a: __m256h) -> __m512i {
14732    _mm512_mask_cvttph_epi32(_mm512_undefined_epi32(), 0xffff, a)
14733}
14734
14735/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14736/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14737///
14738/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi32)
14739#[inline]
14740#[target_feature(enable = "avx512fp16")]
14741#[cfg_attr(test, assert_instr(vcvttph2dq))]
14742#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14743pub fn _mm512_mask_cvttph_epi32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14744    unsafe {
14745        transmute(vcvttph2dq_512(
14746            a,
14747            src.as_i32x16(),
14748            k,
14749            _MM_FROUND_CUR_DIRECTION,
14750        ))
14751    }
14752}
14753
14754/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14755/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14756///
14757/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi32)
14758#[inline]
14759#[target_feature(enable = "avx512fp16")]
14760#[cfg_attr(test, assert_instr(vcvttph2dq))]
14761#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14762pub fn _mm512_maskz_cvttph_epi32(k: __mmask16, a: __m256h) -> __m512i {
14763    _mm512_mask_cvttph_epi32(_mm512_setzero_si512(), k, a)
14764}
14765
14766/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14767/// store the results in dst.
14768///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `sae` parameter.
14770///
14771/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi32)
14772#[inline]
14773#[target_feature(enable = "avx512fp16")]
14774#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14775#[rustc_legacy_const_generics(1)]
14776#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14777pub fn _mm512_cvtt_roundph_epi32<const SAE: i32>(a: __m256h) -> __m512i {
14778    static_assert_sae!(SAE);
14779    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14780}
14781
14782/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14783/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14784///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `sae` parameter.
14786///
14787/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi32)
14788#[inline]
14789#[target_feature(enable = "avx512fp16")]
14790#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14791#[rustc_legacy_const_generics(3)]
14792#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14793pub fn _mm512_mask_cvtt_roundph_epi32<const SAE: i32>(
14794    src: __m512i,
14795    k: __mmask16,
14796    a: __m256h,
14797) -> __m512i {
14798    unsafe {
14799        static_assert_sae!(SAE);
14800        transmute(vcvttph2dq_512(a, src.as_i32x16(), k, SAE))
14801    }
14802}
14803
14804/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit integers with truncation, and
14805/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14806///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `sae` parameter.
14808///
14809/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi32)
14810#[inline]
14811#[target_feature(enable = "avx512fp16")]
14812#[cfg_attr(test, assert_instr(vcvttph2dq, SAE = 8))]
14813#[rustc_legacy_const_generics(2)]
14814#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14815pub fn _mm512_maskz_cvtt_roundph_epi32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
14816    static_assert_sae!(SAE);
14817    _mm512_mask_cvtt_roundph_epi32::<SAE>(_mm512_setzero_si512(), k, a)
14818}
14819
14820/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14821/// the result in dst.
14822///
14823/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_i32)
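///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above):
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_sh(-2.75);
/// let r: i32 = _mm_cvttsh_i32(a); // -2, since truncation always rounds toward zero
/// ```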
14824#[inline]
14825#[target_feature(enable = "avx512fp16")]
14826#[cfg_attr(test, assert_instr(vcvttsh2si))]
14827#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14828pub fn _mm_cvttsh_i32(a: __m128h) -> i32 {
14829    unsafe { vcvttsh2si32(a, _MM_FROUND_CUR_DIRECTION) }
14830}
14831
14832/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit integer with truncation, and store
14833/// the result in dst.
14834///
/// Exceptions can be suppressed by passing [`_MM_FROUND_NO_EXC`] in the `sae` parameter.
14836///
14837/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_i32)
14838#[inline]
14839#[target_feature(enable = "avx512fp16")]
14840#[cfg_attr(test, assert_instr(vcvttsh2si, SAE = 8))]
14841#[rustc_legacy_const_generics(1)]
14842#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14843pub fn _mm_cvtt_roundsh_i32<const SAE: i32>(a: __m128h) -> i32 {
14844    unsafe {
14845        static_assert_sae!(SAE);
14846        vcvttsh2si32(a, SAE)
14847    }
14848}
14849
14850/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14851/// store the results in dst.
14852///
14853/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu32)
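///
/// An illustrative sketch (not a doctest; nightly and AVX512-FP16 assumptions as above). Only the four
/// lower f16 elements of `a` are converted, each truncated toward zero:
///
/// ```ignore
/// use core::arch::x86_64::*;
///
/// let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 65504.0, 3.99, 0.75, 12.5);
/// let r = _mm_cvttph_epu32(a); // lanes 0..3 hold [12, 0, 3, 65504]
/// ```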
14854#[inline]
14855#[target_feature(enable = "avx512fp16,avx512vl")]
14856#[cfg_attr(test, assert_instr(vcvttph2udq))]
14857#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14858pub fn _mm_cvttph_epu32(a: __m128h) -> __m128i {
14859    _mm_mask_cvttph_epu32(_mm_undefined_si128(), 0xff, a)
14860}
14861
14862/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14863/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14864///
14865/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu32)
14866#[inline]
14867#[target_feature(enable = "avx512fp16,avx512vl")]
14868#[cfg_attr(test, assert_instr(vcvttph2udq))]
14869#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14870pub fn _mm_mask_cvttph_epu32(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
14871    unsafe { transmute(vcvttph2udq_128(a, src.as_u32x4(), k)) }
14872}
14873
14874/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14875/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14876///
14877/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu32)
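///
/// A minimal usage sketch (illustrative only, not compiled here), showing that masked-off lanes
/// are zeroed instead of being taken from a source vector:
///
/// ```ignore
/// let a = _mm_set1_ph(3.25);
/// let r = _mm_maskz_cvttph_epu32(0b0011, a);
/// assert_eq!(_mm_extract_epi32::<1>(r), 3); // selected lane: truncated value
/// assert_eq!(_mm_extract_epi32::<2>(r), 0); // masked-off lane: zero
/// ```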
14878#[inline]
14879#[target_feature(enable = "avx512fp16,avx512vl")]
14880#[cfg_attr(test, assert_instr(vcvttph2udq))]
14881#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14882pub fn _mm_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m128i {
14883    _mm_mask_cvttph_epu32(_mm_setzero_si128(), k, a)
14884}
14885
14886/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14887/// store the results in dst.
14888///
14889/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu32)
14890#[inline]
14891#[target_feature(enable = "avx512fp16,avx512vl")]
14892#[cfg_attr(test, assert_instr(vcvttph2udq))]
14893#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14894pub fn _mm256_cvttph_epu32(a: __m128h) -> __m256i {
14895    _mm256_mask_cvttph_epu32(_mm256_undefined_si256(), 0xff, a)
14896}
14897
14898/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14899/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14900///
14901/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu32)
14902#[inline]
14903#[target_feature(enable = "avx512fp16,avx512vl")]
14904#[cfg_attr(test, assert_instr(vcvttph2udq))]
14905#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14906pub fn _mm256_mask_cvttph_epu32(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
14907    unsafe { transmute(vcvttph2udq_256(a, src.as_u32x8(), k)) }
14908}
14909
14910/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14911/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14912///
14913/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu32)
14914#[inline]
14915#[target_feature(enable = "avx512fp16,avx512vl")]
14916#[cfg_attr(test, assert_instr(vcvttph2udq))]
14917#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14918pub fn _mm256_maskz_cvttph_epu32(k: __mmask8, a: __m128h) -> __m256i {
14919    _mm256_mask_cvttph_epu32(_mm256_setzero_si256(), k, a)
14920}
14921
14922/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14923/// store the results in dst.
14924///
14925/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu32)
14926#[inline]
14927#[target_feature(enable = "avx512fp16")]
14928#[cfg_attr(test, assert_instr(vcvttph2udq))]
14929#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14930pub fn _mm512_cvttph_epu32(a: __m256h) -> __m512i {
14931    _mm512_mask_cvttph_epu32(_mm512_undefined_epi32(), 0xffff, a)
14932}
14933
14934/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14935/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14936///
14937/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu32)
14938#[inline]
14939#[target_feature(enable = "avx512fp16")]
14940#[cfg_attr(test, assert_instr(vcvttph2udq))]
14941#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14942pub fn _mm512_mask_cvttph_epu32(src: __m512i, k: __mmask16, a: __m256h) -> __m512i {
14943    unsafe {
14944        transmute(vcvttph2udq_512(
14945            a,
14946            src.as_u32x16(),
14947            k,
14948            _MM_FROUND_CUR_DIRECTION,
14949        ))
14950    }
14951}
14952
14953/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14954/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
14955///
14956/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu32)
14957#[inline]
14958#[target_feature(enable = "avx512fp16")]
14959#[cfg_attr(test, assert_instr(vcvttph2udq))]
14960#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14961pub fn _mm512_maskz_cvttph_epu32(k: __mmask16, a: __m256h) -> __m512i {
14962    _mm512_mask_cvttph_epu32(_mm512_setzero_si512(), k, a)
14963}
14964
14965/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14966/// store the results in dst.
14967///
14968/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14969///
14970/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu32)
14971#[inline]
14972#[target_feature(enable = "avx512fp16")]
14973#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14974#[rustc_legacy_const_generics(1)]
14975#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14976pub fn _mm512_cvtt_roundph_epu32<const SAE: i32>(a: __m256h) -> __m512i {
14977    static_assert_sae!(SAE);
14978    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_undefined_epi32(), 0xffff, a)
14979}
14980
14981/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
14982/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
14983///
14984/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
14985///
14986/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu32)
14987#[inline]
14988#[target_feature(enable = "avx512fp16")]
14989#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
14990#[rustc_legacy_const_generics(3)]
14991#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
14992pub fn _mm512_mask_cvtt_roundph_epu32<const SAE: i32>(
14993    src: __m512i,
14994    k: __mmask16,
14995    a: __m256h,
14996) -> __m512i {
14997    unsafe {
14998        static_assert_sae!(SAE);
14999        transmute(vcvttph2udq_512(a, src.as_u32x16(), k, SAE))
15000    }
15001}
15002
15003/// Convert packed half-precision (16-bit) floating-point elements in a to packed 32-bit unsigned integers with truncation, and
15004/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15005///
15006/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15007///
15008/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu32)
15009#[inline]
15010#[target_feature(enable = "avx512fp16")]
15011#[cfg_attr(test, assert_instr(vcvttph2udq, SAE = 8))]
15012#[rustc_legacy_const_generics(2)]
15013#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15014pub fn _mm512_maskz_cvtt_roundph_epu32<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512i {
15015    static_assert_sae!(SAE);
15016    _mm512_mask_cvtt_roundph_epu32::<SAE>(_mm512_setzero_si512(), k, a)
15017}
15018
15019/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
15020/// the result in dst.
15021///
15022/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsh_u32)
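///
/// A minimal usage sketch (illustrative only, not compiled here); the fractional part is simply
/// discarded:
///
/// ```ignore
/// let x = _mm_set_sh(42.9);
/// assert_eq!(_mm_cvttsh_u32(x), 42);
/// ```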
15023#[inline]
15024#[target_feature(enable = "avx512fp16")]
15025#[cfg_attr(test, assert_instr(vcvttsh2usi))]
15026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15027pub fn _mm_cvttsh_u32(a: __m128h) -> u32 {
15028    unsafe { vcvttsh2usi32(a, _MM_FROUND_CUR_DIRECTION) }
15029}
15030
15031/// Convert the lower half-precision (16-bit) floating-point element in a to a 32-bit unsigned integer with truncation, and store
15032/// the result in dst.
15033///
15034/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15035///
15036/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsh_u32)
15037#[inline]
15038#[target_feature(enable = "avx512fp16")]
15039#[cfg_attr(test, assert_instr(vcvttsh2usi, SAE = 8))]
15040#[rustc_legacy_const_generics(1)]
15041#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15042pub fn _mm_cvtt_roundsh_u32<const SAE: i32>(a: __m128h) -> u32 {
15043    unsafe {
15044        static_assert_sae!(SAE);
15045        vcvttsh2usi32(a, SAE)
15046    }
15047}
15048
15049/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15050/// store the results in dst.
15051///
15052/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epi64)
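///
/// A minimal usage sketch (illustrative only, not compiled here; `_mm_extract_epi64` additionally
/// requires `sse4.1` on x86_64). Under the default MXCSR rounding mode (round to nearest, ties to
/// even), halfway values round to the nearest even integer, unlike the truncating
/// `_mm_cvttph_epi64`:
///
/// ```ignore
/// let a = _mm_set1_ph(3.5);
/// let r = _mm_cvtph_epi64(a); // only the two lowest f16 lanes are converted
/// assert_eq!(_mm_extract_epi64::<0>(r), 4);                   // 3.5 rounds to even -> 4
/// assert_eq!(_mm_extract_epi64::<0>(_mm_cvttph_epi64(a)), 3); // truncation keeps 3
/// ```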
15053#[inline]
15054#[target_feature(enable = "avx512fp16,avx512vl")]
15055#[cfg_attr(test, assert_instr(vcvtph2qq))]
15056#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15057pub fn _mm_cvtph_epi64(a: __m128h) -> __m128i {
15058    _mm_mask_cvtph_epi64(_mm_undefined_si128(), 0xff, a)
15059}
15060
15061/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15062/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15063///
15064/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epi64)
15065#[inline]
15066#[target_feature(enable = "avx512fp16,avx512vl")]
15067#[cfg_attr(test, assert_instr(vcvtph2qq))]
15068#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15069pub fn _mm_mask_cvtph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15070    unsafe { transmute(vcvtph2qq_128(a, src.as_i64x2(), k)) }
15071}
15072
15073/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15074/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15075///
15076/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epi64)
15077#[inline]
15078#[target_feature(enable = "avx512fp16,avx512vl")]
15079#[cfg_attr(test, assert_instr(vcvtph2qq))]
15080#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15081pub fn _mm_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15082    _mm_mask_cvtph_epi64(_mm_setzero_si128(), k, a)
15083}
15084
15085/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15086/// store the results in dst.
15087///
15088/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epi64)
15089#[inline]
15090#[target_feature(enable = "avx512fp16,avx512vl")]
15091#[cfg_attr(test, assert_instr(vcvtph2qq))]
15092#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15093pub fn _mm256_cvtph_epi64(a: __m128h) -> __m256i {
15094    _mm256_mask_cvtph_epi64(_mm256_undefined_si256(), 0xff, a)
15095}
15096
15097/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15098/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15099///
15100/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epi64)
15101#[inline]
15102#[target_feature(enable = "avx512fp16,avx512vl")]
15103#[cfg_attr(test, assert_instr(vcvtph2qq))]
15104#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15105pub fn _mm256_mask_cvtph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15106    unsafe { transmute(vcvtph2qq_256(a, src.as_i64x4(), k)) }
15107}
15108
15109/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15110/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15111///
15112/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epi64)
15113#[inline]
15114#[target_feature(enable = "avx512fp16,avx512vl")]
15115#[cfg_attr(test, assert_instr(vcvtph2qq))]
15116#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15117pub fn _mm256_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15118    _mm256_mask_cvtph_epi64(_mm256_setzero_si256(), k, a)
15119}
15120
15121/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15122/// store the results in dst.
15123///
15124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epi64)
15125#[inline]
15126#[target_feature(enable = "avx512fp16")]
15127#[cfg_attr(test, assert_instr(vcvtph2qq))]
15128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15129pub fn _mm512_cvtph_epi64(a: __m128h) -> __m512i {
15130    _mm512_mask_cvtph_epi64(_mm512_undefined_epi32(), 0xff, a)
15131}
15132
15133/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15134/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15135///
15136/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epi64)
15137#[inline]
15138#[target_feature(enable = "avx512fp16")]
15139#[cfg_attr(test, assert_instr(vcvtph2qq))]
15140#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15141pub fn _mm512_mask_cvtph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15142    unsafe {
15143        transmute(vcvtph2qq_512(
15144            a,
15145            src.as_i64x8(),
15146            k,
15147            _MM_FROUND_CUR_DIRECTION,
15148        ))
15149    }
15150}
15151
15152/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15153/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15154///
15155/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epi64)
15156#[inline]
15157#[target_feature(enable = "avx512fp16")]
15158#[cfg_attr(test, assert_instr(vcvtph2qq))]
15159#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15160pub fn _mm512_maskz_cvtph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15161    _mm512_mask_cvtph_epi64(_mm512_setzero_si512(), k, a)
15162}
15163
15164/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15165/// store the results in dst.
15166///
15167/// Rounding is done according to the rounding parameter, which can be one of:
15168///
15169/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15170/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15171/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15172/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15173/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15174///
15175/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epi64)
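///
/// A minimal usage sketch (illustrative only, not compiled here), selecting round-down with
/// exception reporting suppressed via the `ROUNDING` const generic:
///
/// ```ignore
/// let a = _mm_set1_ph(-1.5);
/// let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a);
/// // every 64-bit lane of `r` holds -2, since -1.5 is rounded toward negative infinity
/// ```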
15176#[inline]
15177#[target_feature(enable = "avx512fp16")]
15178#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15179#[rustc_legacy_const_generics(1)]
15180#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15181pub fn _mm512_cvt_roundph_epi64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15182    static_assert_rounding!(ROUNDING);
15183    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15184}
15185
15186/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15187/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15188///
15189/// Rounding is done according to the rounding parameter, which can be one of:
15190///
15191/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15192/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15193/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15194/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15195/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15196///
15197/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epi64)
15198#[inline]
15199#[target_feature(enable = "avx512fp16")]
15200#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15201#[rustc_legacy_const_generics(3)]
15202#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15203pub fn _mm512_mask_cvt_roundph_epi64<const ROUNDING: i32>(
15204    src: __m512i,
15205    k: __mmask8,
15206    a: __m128h,
15207) -> __m512i {
15208    unsafe {
15209        static_assert_rounding!(ROUNDING);
15210        transmute(vcvtph2qq_512(a, src.as_i64x8(), k, ROUNDING))
15211    }
15212}
15213
15214/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers, and
15215/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15216///
15217/// Rounding is done according to the rounding parameter, which can be one of:
15218///
15219/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15220/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15221/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15222/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15223/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15224///
15225/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epi64)
15226#[inline]
15227#[target_feature(enable = "avx512fp16")]
15228#[cfg_attr(test, assert_instr(vcvtph2qq, ROUNDING = 8))]
15229#[rustc_legacy_const_generics(2)]
15230#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15231pub fn _mm512_maskz_cvt_roundph_epi64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15232    static_assert_rounding!(ROUNDING);
15233    _mm512_mask_cvt_roundph_epi64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15234}
15235
15236/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15237/// store the results in dst.
15238///
15239/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_epu64)
15240#[inline]
15241#[target_feature(enable = "avx512fp16,avx512vl")]
15242#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15243#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15244pub fn _mm_cvtph_epu64(a: __m128h) -> __m128i {
15245    _mm_mask_cvtph_epu64(_mm_undefined_si128(), 0xff, a)
15246}
15247
15248/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15249/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15250///
15251/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_epu64)
15252#[inline]
15253#[target_feature(enable = "avx512fp16,avx512vl")]
15254#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15255#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15256pub fn _mm_mask_cvtph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15257    unsafe { transmute(vcvtph2uqq_128(a, src.as_u64x2(), k)) }
15258}
15259
15260/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15261/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15262///
15263/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_epu64)
15264#[inline]
15265#[target_feature(enable = "avx512fp16,avx512vl")]
15266#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15267#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15268pub fn _mm_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15269    _mm_mask_cvtph_epu64(_mm_setzero_si128(), k, a)
15270}
15271
15272/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15273/// store the results in dst.
15274///
15275/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_epu64)
15276#[inline]
15277#[target_feature(enable = "avx512fp16,avx512vl")]
15278#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15279#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15280pub fn _mm256_cvtph_epu64(a: __m128h) -> __m256i {
15281    _mm256_mask_cvtph_epu64(_mm256_undefined_si256(), 0xff, a)
15282}
15283
15284/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15285/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15286///
15287/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_epu64)
15288#[inline]
15289#[target_feature(enable = "avx512fp16,avx512vl")]
15290#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15291#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15292pub fn _mm256_mask_cvtph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15293    unsafe { transmute(vcvtph2uqq_256(a, src.as_u64x4(), k)) }
15294}
15295
15296/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15297/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15298///
15299/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_epu64)
15300#[inline]
15301#[target_feature(enable = "avx512fp16,avx512vl")]
15302#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15303#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15304pub fn _mm256_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15305    _mm256_mask_cvtph_epu64(_mm256_setzero_si256(), k, a)
15306}
15307
15308/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15309/// store the results in dst.
15310///
15311/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_epu64)
15312#[inline]
15313#[target_feature(enable = "avx512fp16")]
15314#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15315#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15316pub fn _mm512_cvtph_epu64(a: __m128h) -> __m512i {
15317    _mm512_mask_cvtph_epu64(_mm512_undefined_epi32(), 0xff, a)
15318}
15319
15320/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15321/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15322///
15323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_epu64)
15324#[inline]
15325#[target_feature(enable = "avx512fp16")]
15326#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15327#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15328pub fn _mm512_mask_cvtph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15329    unsafe {
15330        transmute(vcvtph2uqq_512(
15331            a,
15332            src.as_u64x8(),
15333            k,
15334            _MM_FROUND_CUR_DIRECTION,
15335        ))
15336    }
15337}
15338
15339/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15340/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15341///
15342/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_epu64)
15343#[inline]
15344#[target_feature(enable = "avx512fp16")]
15345#[cfg_attr(test, assert_instr(vcvtph2uqq))]
15346#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15347pub fn _mm512_maskz_cvtph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15348    _mm512_mask_cvtph_epu64(_mm512_setzero_si512(), k, a)
15349}
15350
15351/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15352/// store the results in dst.
15353///
15354/// Rounding is done according to the rounding parameter, which can be one of:
15355///
15356/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15357/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15358/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15359/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15360/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15361///
15362/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_epu64)
15363#[inline]
15364#[target_feature(enable = "avx512fp16")]
15365#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15366#[rustc_legacy_const_generics(1)]
15367#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15368pub fn _mm512_cvt_roundph_epu64<const ROUNDING: i32>(a: __m128h) -> __m512i {
15369    static_assert_rounding!(ROUNDING);
15370    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_undefined_epi32(), 0xff, a)
15371}
15372
15373/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15374/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15375///
15376/// Rounding is done according to the rounding parameter, which can be one of:
15377///
15378/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15379/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15380/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15381/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15382/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15383///
15384/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_epu64)
15385#[inline]
15386#[target_feature(enable = "avx512fp16")]
15387#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15388#[rustc_legacy_const_generics(3)]
15389#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15390pub fn _mm512_mask_cvt_roundph_epu64<const ROUNDING: i32>(
15391    src: __m512i,
15392    k: __mmask8,
15393    a: __m128h,
15394) -> __m512i {
15395    unsafe {
15396        static_assert_rounding!(ROUNDING);
15397        transmute(vcvtph2uqq_512(a, src.as_u64x8(), k, ROUNDING))
15398    }
15399}
15400
15401/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers, and
15402/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15403///
15404/// Rounding is done according to the rounding parameter, which can be one of:
15405///
15406/// * [`_MM_FROUND_TO_NEAREST_INT`] | [`_MM_FROUND_NO_EXC`] : round to nearest and suppress exceptions
15407/// * [`_MM_FROUND_TO_NEG_INF`] | [`_MM_FROUND_NO_EXC`] : round down and suppress exceptions
15408/// * [`_MM_FROUND_TO_POS_INF`] | [`_MM_FROUND_NO_EXC`] : round up and suppress exceptions
15409/// * [`_MM_FROUND_TO_ZERO`] | [`_MM_FROUND_NO_EXC`] : truncate and suppress exceptions
15410/// * [`_MM_FROUND_CUR_DIRECTION`] : use `MXCSR.RC` - see [`_MM_SET_ROUNDING_MODE`]
15411///
15412/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_epu64)
15413#[inline]
15414#[target_feature(enable = "avx512fp16")]
15415#[cfg_attr(test, assert_instr(vcvtph2uqq, ROUNDING = 8))]
15416#[rustc_legacy_const_generics(2)]
15417#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15418pub fn _mm512_maskz_cvt_roundph_epu64<const ROUNDING: i32>(k: __mmask8, a: __m128h) -> __m512i {
15419    static_assert_rounding!(ROUNDING);
15420    _mm512_mask_cvt_roundph_epu64::<ROUNDING>(_mm512_setzero_si512(), k, a)
15421}
15422
15423/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15424/// store the results in dst.
15425///
15426/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epi64)
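///
/// A minimal usage sketch (illustrative only, not compiled here); as with the 32-bit variants,
/// truncation rounds toward zero:
///
/// ```ignore
/// let a = _mm_set1_ph(-7.75);
/// let r = _mm_cvttph_epi64(a); // only the two lowest f16 lanes are converted
/// assert_eq!(_mm_extract_epi64::<0>(r), -7);
/// ```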
15427#[inline]
15428#[target_feature(enable = "avx512fp16,avx512vl")]
15429#[cfg_attr(test, assert_instr(vcvttph2qq))]
15430#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15431pub fn _mm_cvttph_epi64(a: __m128h) -> __m128i {
15432    _mm_mask_cvttph_epi64(_mm_undefined_si128(), 0xff, a)
15433}
15434
15435/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15436/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15437///
15438/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epi64)
15439#[inline]
15440#[target_feature(enable = "avx512fp16,avx512vl")]
15441#[cfg_attr(test, assert_instr(vcvttph2qq))]
15442#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15443pub fn _mm_mask_cvttph_epi64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15444    unsafe { transmute(vcvttph2qq_128(a, src.as_i64x2(), k)) }
15445}
15446
15447/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15448/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15449///
15450/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epi64)
15451#[inline]
15452#[target_feature(enable = "avx512fp16,avx512vl")]
15453#[cfg_attr(test, assert_instr(vcvttph2qq))]
15454#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15455pub fn _mm_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m128i {
15456    _mm_mask_cvttph_epi64(_mm_setzero_si128(), k, a)
15457}
15458
15459/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15460/// store the results in dst.
15461///
15462/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epi64)
15463#[inline]
15464#[target_feature(enable = "avx512fp16,avx512vl")]
15465#[cfg_attr(test, assert_instr(vcvttph2qq))]
15466#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15467pub fn _mm256_cvttph_epi64(a: __m128h) -> __m256i {
15468    _mm256_mask_cvttph_epi64(_mm256_undefined_si256(), 0xff, a)
15469}
15470
15471/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15472/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15473///
15474/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epi64)
15475#[inline]
15476#[target_feature(enable = "avx512fp16,avx512vl")]
15477#[cfg_attr(test, assert_instr(vcvttph2qq))]
15478#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15479pub fn _mm256_mask_cvttph_epi64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15480    unsafe { transmute(vcvttph2qq_256(a, src.as_i64x4(), k)) }
15481}
15482
15483/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15484/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15485///
15486/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epi64)
15487#[inline]
15488#[target_feature(enable = "avx512fp16,avx512vl")]
15489#[cfg_attr(test, assert_instr(vcvttph2qq))]
15490#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15491pub fn _mm256_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m256i {
15492    _mm256_mask_cvttph_epi64(_mm256_setzero_si256(), k, a)
15493}
15494
15495/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15496/// store the results in dst.
15497///
15498/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epi64)
15499#[inline]
15500#[target_feature(enable = "avx512fp16")]
15501#[cfg_attr(test, assert_instr(vcvttph2qq))]
15502#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15503pub fn _mm512_cvttph_epi64(a: __m128h) -> __m512i {
15504    _mm512_mask_cvttph_epi64(_mm512_undefined_epi32(), 0xff, a)
15505}
15506
15507/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15508/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15509///
15510/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epi64)
15511#[inline]
15512#[target_feature(enable = "avx512fp16")]
15513#[cfg_attr(test, assert_instr(vcvttph2qq))]
15514#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15515pub fn _mm512_mask_cvttph_epi64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15516    unsafe {
15517        transmute(vcvttph2qq_512(
15518            a,
15519            src.as_i64x8(),
15520            k,
15521            _MM_FROUND_CUR_DIRECTION,
15522        ))
15523    }
15524}
15525
15526/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15527/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15528///
15529/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epi64)
15530#[inline]
15531#[target_feature(enable = "avx512fp16")]
15532#[cfg_attr(test, assert_instr(vcvttph2qq))]
15533#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15534pub fn _mm512_maskz_cvttph_epi64(k: __mmask8, a: __m128h) -> __m512i {
15535    _mm512_mask_cvttph_epi64(_mm512_setzero_si512(), k, a)
15536}
15537
15538/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15539/// store the results in dst.
15540///
15541/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15542///
15543/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epi64)
15544#[inline]
15545#[target_feature(enable = "avx512fp16")]
15546#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15547#[rustc_legacy_const_generics(1)]
15548#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15549pub fn _mm512_cvtt_roundph_epi64<const SAE: i32>(a: __m128h) -> __m512i {
15550    static_assert_sae!(SAE);
15551    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15552}
15553
15554/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15555/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15556///
15557/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15558///
15559/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epi64)
15560#[inline]
15561#[target_feature(enable = "avx512fp16")]
15562#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15563#[rustc_legacy_const_generics(3)]
15564#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15565pub fn _mm512_mask_cvtt_roundph_epi64<const SAE: i32>(
15566    src: __m512i,
15567    k: __mmask8,
15568    a: __m128h,
15569) -> __m512i {
15570    unsafe {
15571        static_assert_sae!(SAE);
15572        transmute(vcvttph2qq_512(a, src.as_i64x8(), k, SAE))
15573    }
15574}
15575
15576/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit integers with truncation, and
15577/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15578///
15579/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15580///
15581/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epi64)
15582#[inline]
15583#[target_feature(enable = "avx512fp16")]
15584#[cfg_attr(test, assert_instr(vcvttph2qq, SAE = 8))]
15585#[rustc_legacy_const_generics(2)]
15586#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15587pub fn _mm512_maskz_cvtt_roundph_epi64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15588    static_assert_sae!(SAE);
15589    _mm512_mask_cvtt_roundph_epi64::<SAE>(_mm512_setzero_si512(), k, a)
15590}
15591
15592/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15593/// store the results in dst.
15594///
15595/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttph_epu64)
15596#[inline]
15597#[target_feature(enable = "avx512fp16,avx512vl")]
15598#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15599#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15600pub fn _mm_cvttph_epu64(a: __m128h) -> __m128i {
15601    _mm_mask_cvttph_epu64(_mm_undefined_si128(), 0xff, a)
15602}
15603
15604/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15605/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15606///
15607/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttph_epu64)
15608#[inline]
15609#[target_feature(enable = "avx512fp16,avx512vl")]
15610#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15611#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15612pub fn _mm_mask_cvttph_epu64(src: __m128i, k: __mmask8, a: __m128h) -> __m128i {
15613    unsafe { transmute(vcvttph2uqq_128(a, src.as_u64x2(), k)) }
15614}
15615
15616/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15617/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15618///
15619/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttph_epu64)
15620#[inline]
15621#[target_feature(enable = "avx512fp16,avx512vl")]
15622#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15623#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15624pub fn _mm_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m128i {
15625    _mm_mask_cvttph_epu64(_mm_setzero_si128(), k, a)
15626}
15627
15628/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15629/// store the results in dst.
15630///
15631/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttph_epu64)
15632#[inline]
15633#[target_feature(enable = "avx512fp16,avx512vl")]
15634#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15635#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15636pub fn _mm256_cvttph_epu64(a: __m128h) -> __m256i {
15637    _mm256_mask_cvttph_epu64(_mm256_undefined_si256(), 0xff, a)
15638}
15639
15640/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15641/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15642///
15643/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttph_epu64)
15644#[inline]
15645#[target_feature(enable = "avx512fp16,avx512vl")]
15646#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15647#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15648pub fn _mm256_mask_cvttph_epu64(src: __m256i, k: __mmask8, a: __m128h) -> __m256i {
15649    unsafe { transmute(vcvttph2uqq_256(a, src.as_u64x4(), k)) }
15650}
15651
15652/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15653/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15654///
15655/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttph_epu64)
15656#[inline]
15657#[target_feature(enable = "avx512fp16,avx512vl")]
15658#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15659#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15660pub fn _mm256_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m256i {
15661    _mm256_mask_cvttph_epu64(_mm256_setzero_si256(), k, a)
15662}
15663
15664/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15665/// store the results in dst.
15666///
15667/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttph_epu64)
15668#[inline]
15669#[target_feature(enable = "avx512fp16")]
15670#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15671#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15672pub fn _mm512_cvttph_epu64(a: __m128h) -> __m512i {
15673    _mm512_mask_cvttph_epu64(_mm512_undefined_epi32(), 0xff, a)
15674}
15675
15676/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15677/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15678///
15679/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttph_epu64)
15680#[inline]
15681#[target_feature(enable = "avx512fp16")]
15682#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15683#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15684pub fn _mm512_mask_cvttph_epu64(src: __m512i, k: __mmask8, a: __m128h) -> __m512i {
15685    unsafe {
15686        transmute(vcvttph2uqq_512(
15687            a,
15688            src.as_u64x8(),
15689            k,
15690            _MM_FROUND_CUR_DIRECTION,
15691        ))
15692    }
15693}
15694
15695/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15696/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15697///
15698/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttph_epu64)
15699#[inline]
15700#[target_feature(enable = "avx512fp16")]
15701#[cfg_attr(test, assert_instr(vcvttph2uqq))]
15702#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15703pub fn _mm512_maskz_cvttph_epu64(k: __mmask8, a: __m128h) -> __m512i {
15704    _mm512_mask_cvttph_epu64(_mm512_setzero_si512(), k, a)
15705}
15706
15707/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15708/// store the results in dst.
15709///
15710/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15711///
15712/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundph_epu64)
15713#[inline]
15714#[target_feature(enable = "avx512fp16")]
15715#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15716#[rustc_legacy_const_generics(1)]
15717#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15718pub fn _mm512_cvtt_roundph_epu64<const SAE: i32>(a: __m128h) -> __m512i {
15719    static_assert_sae!(SAE);
15720    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_undefined_epi32(), 0xff, a)
15721}
15722
15723/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15724/// store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
15725///
15726/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15727///
15728/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundph_epu64)
15729#[inline]
15730#[target_feature(enable = "avx512fp16")]
15731#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15732#[rustc_legacy_const_generics(3)]
15733#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15734pub fn _mm512_mask_cvtt_roundph_epu64<const SAE: i32>(
15735    src: __m512i,
15736    k: __mmask8,
15737    a: __m128h,
15738) -> __m512i {
15739    unsafe {
15740        static_assert_sae!(SAE);
15741        transmute(vcvttph2uqq_512(a, src.as_u64x8(), k, SAE))
15742    }
15743}
15744
15745/// Convert packed half-precision (16-bit) floating-point elements in a to packed 64-bit unsigned integers with truncation, and
15746/// store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
15747///
15748/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15749///
15750/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundph_epu64)
15751#[inline]
15752#[target_feature(enable = "avx512fp16")]
15753#[cfg_attr(test, assert_instr(vcvttph2uqq, SAE = 8))]
15754#[rustc_legacy_const_generics(2)]
15755#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15756pub fn _mm512_maskz_cvtt_roundph_epu64<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512i {
15757    static_assert_sae!(SAE);
15758    _mm512_mask_cvtt_roundph_epu64::<SAE>(_mm512_setzero_si512(), k, a)
15759}
15760
15761/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15762/// floating-point elements, and store the results in dst.
15763///
15764/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtxph_ps)
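///
/// A minimal usage sketch (illustrative only, not compiled here). The four lowest `f16` lanes are
/// widened to `f32`; every `f16` value is exactly representable as an `f32`, so the conversion is
/// lossless:
///
/// ```ignore
/// let a = _mm_set1_ph(0.5);
/// let r = _mm_cvtxph_ps(a);
/// assert_eq!(_mm_cvtss_f32(r), 0.5f32);
/// ```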
15765#[inline]
15766#[target_feature(enable = "avx512fp16,avx512vl")]
15767#[cfg_attr(test, assert_instr(vcvtph2psx))]
15768#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15769pub fn _mm_cvtxph_ps(a: __m128h) -> __m128 {
15770    _mm_mask_cvtxph_ps(_mm_setzero_ps(), 0xff, a)
15771}
15772
15773/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15774/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15775/// dst when the corresponding mask bit is not set).
15776///
15777/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtxph_ps)
15778#[inline]
15779#[target_feature(enable = "avx512fp16,avx512vl")]
15780#[cfg_attr(test, assert_instr(vcvtph2psx))]
15781#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15782pub fn _mm_mask_cvtxph_ps(src: __m128, k: __mmask8, a: __m128h) -> __m128 {
15783    unsafe { vcvtph2psx_128(a, src, k) }
15784}
15785
15786/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15787/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15788/// corresponding mask bit is not set).
15789///
15790/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtxph_ps)
15791#[inline]
15792#[target_feature(enable = "avx512fp16,avx512vl")]
15793#[cfg_attr(test, assert_instr(vcvtph2psx))]
15794#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15795pub fn _mm_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m128 {
15796    _mm_mask_cvtxph_ps(_mm_setzero_ps(), k, a)
15797}
15798
15799/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15800/// floating-point elements, and store the results in dst.
15801///
15802/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtxph_ps)
15803#[inline]
15804#[target_feature(enable = "avx512fp16,avx512vl")]
15805#[cfg_attr(test, assert_instr(vcvtph2psx))]
15806#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15807pub fn _mm256_cvtxph_ps(a: __m128h) -> __m256 {
15808    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), 0xff, a)
15809}
15810
15811/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15812/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15813/// dst when the corresponding mask bit is not set).
15814///
15815/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtxph_ps)
15816#[inline]
15817#[target_feature(enable = "avx512fp16,avx512vl")]
15818#[cfg_attr(test, assert_instr(vcvtph2psx))]
15819#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15820pub fn _mm256_mask_cvtxph_ps(src: __m256, k: __mmask8, a: __m128h) -> __m256 {
15821    unsafe { vcvtph2psx_256(a, src, k) }
15822}
15823
15824/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15825/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15826/// corresponding mask bit is not set).
15827///
15828/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtxph_ps)
15829#[inline]
15830#[target_feature(enable = "avx512fp16,avx512vl")]
15831#[cfg_attr(test, assert_instr(vcvtph2psx))]
15832#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15833pub fn _mm256_maskz_cvtxph_ps(k: __mmask8, a: __m128h) -> __m256 {
15834    _mm256_mask_cvtxph_ps(_mm256_setzero_ps(), k, a)
15835}
15836
15837/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15838/// floating-point elements, and store the results in dst.
15839///
15840/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtxph_ps)
15841#[inline]
15842#[target_feature(enable = "avx512fp16")]
15843#[cfg_attr(test, assert_instr(vcvtph2psx))]
15844#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15845pub fn _mm512_cvtxph_ps(a: __m256h) -> __m512 {
15846    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), 0xffff, a)
15847}
15848
15849/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15850/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15851/// dst when the corresponding mask bit is not set).
15852///
15853/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtxph_ps)
15854#[inline]
15855#[target_feature(enable = "avx512fp16")]
15856#[cfg_attr(test, assert_instr(vcvtph2psx))]
15857#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15858pub fn _mm512_mask_cvtxph_ps(src: __m512, k: __mmask16, a: __m256h) -> __m512 {
15859    unsafe { vcvtph2psx_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
15860}
15861
15862/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15863/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15864/// corresponding mask bit is not set).
15865///
15866/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtxph_ps)
15867#[inline]
15868#[target_feature(enable = "avx512fp16")]
15869#[cfg_attr(test, assert_instr(vcvtph2psx))]
15870#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15871pub fn _mm512_maskz_cvtxph_ps(k: __mmask16, a: __m256h) -> __m512 {
15872    _mm512_mask_cvtxph_ps(_mm512_setzero_ps(), k, a)
15873}
15874
15875/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15876/// floating-point elements, and store the results in dst.
15877///
15878/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15879///
15880/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtx_roundph_ps)
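///
/// A minimal usage sketch (illustrative only, not compiled here; `_mm256_set1_ph` is assumed to be
/// available from this module). Because the widening conversion is always exact, `SAE` only
/// controls whether exception flags may be raised:
///
/// ```ignore
/// let a = _mm256_set1_ph(1.0);
/// let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
/// // all sixteen f32 lanes of `r` hold 1.0
/// ```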
15881#[inline]
15882#[target_feature(enable = "avx512fp16")]
15883#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15884#[rustc_legacy_const_generics(1)]
15885#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15886pub fn _mm512_cvtx_roundph_ps<const SAE: i32>(a: __m256h) -> __m512 {
15887    static_assert_sae!(SAE);
15888    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), 0xffff, a)
15889}
15890
15891/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15892/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
15893/// dst when the corresponding mask bit is not set).
15894///
15895/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `sae` parameter.
15896///
15897/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtx_roundph_ps)
15898#[inline]
15899#[target_feature(enable = "avx512fp16")]
15900#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15901#[rustc_legacy_const_generics(3)]
15902#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15903pub fn _mm512_mask_cvtx_roundph_ps<const SAE: i32>(
15904    src: __m512,
15905    k: __mmask16,
15906    a: __m256h,
15907) -> __m512 {
15908    unsafe {
15909        static_assert_sae!(SAE);
15910        vcvtph2psx_512(a, src, k, SAE)
15911    }
15912}
15913
15914/// Convert packed half-precision (16-bit) floating-point elements in a to packed single-precision (32-bit)
15915/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
15916/// corresponding mask bit is not set).
15917///
15918/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15919///
15920/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtx_roundph_ps)
15921#[inline]
15922#[target_feature(enable = "avx512fp16")]
15923#[cfg_attr(test, assert_instr(vcvtph2psx, SAE = 8))]
15924#[rustc_legacy_const_generics(2)]
15925#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15926pub fn _mm512_maskz_cvtx_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256h) -> __m512 {
15927    static_assert_sae!(SAE);
15928    _mm512_mask_cvtx_roundph_ps::<SAE>(_mm512_setzero_ps(), k, a)
15929}
15930
15931/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15932/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed
15933/// elements from a to the upper elements of dst.
15934///
15935/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_ss)
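///
/// A minimal usage sketch (illustrative only):
///
/// ```ignore
/// let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); // lanes: [1.0, 2.0, 3.0, 4.0]
/// let b = _mm_set_sh(2.5);
/// // Lane 0 is 2.5 converted from `b`; lanes 1..4 are copied from `a`.
/// let r = _mm_cvtsh_ss(a, b);             // lanes: [2.5, 2.0, 3.0, 4.0]
/// ```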
15936#[inline]
15937#[target_feature(enable = "avx512fp16")]
15938#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15939#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15940pub fn _mm_cvtsh_ss(a: __m128, b: __m128h) -> __m128 {
15941    _mm_mask_cvtsh_ss(a, 0xff, a, b)
15942}
15943
15944/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15945/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15946/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15947/// upper elements of dst.
15948///
15949/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_ss)
15950#[inline]
15951#[target_feature(enable = "avx512fp16")]
15952#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15953#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15954pub fn _mm_mask_cvtsh_ss(src: __m128, k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15955    unsafe { vcvtsh2ss(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
15956}
15957
15958/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15959/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
15960/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
15961/// of dst.
15962///
15963/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_ss)
15964#[inline]
15965#[target_feature(enable = "avx512fp16")]
15966#[cfg_attr(test, assert_instr(vcvtsh2ss))]
15967#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15968pub fn _mm_maskz_cvtsh_ss(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
15969    _mm_mask_cvtsh_ss(_mm_set_ss(0.0), k, a, b)
15970}
15971
15972/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15973/// floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements
15974/// from a to the upper elements of dst.
15975///
15976/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15977///
15978/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_ss)
15979#[inline]
15980#[target_feature(enable = "avx512fp16")]
15981#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
15982#[rustc_legacy_const_generics(2)]
15983#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
15984pub fn _mm_cvt_roundsh_ss<const SAE: i32>(a: __m128, b: __m128h) -> __m128 {
15985    static_assert_sae!(SAE);
15986    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_undefined_ps(), 0xff, a, b)
15987}
15988
15989/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
15990/// floating-point element, store the result in the lower element of dst using writemask k (the element is
15991/// copied from src to dst when mask bit 0 is not set), and copy the upper 3 packed elements from a to the
15992/// upper elements of dst.
15993///
15994/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
15995///
15996/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_ss)
15997#[inline]
15998#[target_feature(enable = "avx512fp16")]
15999#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
16000#[rustc_legacy_const_generics(4)]
16001#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16002pub fn _mm_mask_cvt_roundsh_ss<const SAE: i32>(
16003    src: __m128,
16004    k: __mmask8,
16005    a: __m128,
16006    b: __m128h,
16007) -> __m128 {
16008    unsafe {
16009        static_assert_sae!(SAE);
16010        vcvtsh2ss(a, b, src, k, SAE)
16011    }
16012}
16013
16014/// Convert the lower half-precision (16-bit) floating-point element in b to a single-precision (32-bit)
16015/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16016/// zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements
16017/// of dst.
16018///
16019/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16020///
16021/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_ss)
16022#[inline]
16023#[target_feature(enable = "avx512fp16")]
16024#[cfg_attr(test, assert_instr(vcvtsh2ss, SAE = 8))]
16025#[rustc_legacy_const_generics(3)]
16026#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16027pub fn _mm_maskz_cvt_roundsh_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128h) -> __m128 {
16028    static_assert_sae!(SAE);
16029    _mm_mask_cvt_roundsh_ss::<SAE>(_mm_set_ss(0.0), k, a, b)
16030}
16031
16032/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16033/// floating-point elements, and store the results in dst.
16034///
16035/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtph_pd)
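///
/// A minimal usage sketch (illustrative only; only the two lowest `f16` lanes of `a`
/// participate, since dst holds two `f64` elements):
///
/// ```ignore
/// let a = _mm_setr_ph(1.5, -2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0);
/// let r = _mm_cvtph_pd(a);             // lanes: [1.5, -2.0]
/// let z = _mm_maskz_cvtph_pd(0b01, a); // lanes: [1.5, 0.0]
/// ```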
16036#[inline]
16037#[target_feature(enable = "avx512fp16,avx512vl")]
16038#[cfg_attr(test, assert_instr(vcvtph2pd))]
16039#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16040pub fn _mm_cvtph_pd(a: __m128h) -> __m128d {
16041    _mm_mask_cvtph_pd(_mm_setzero_pd(), 0xff, a)
16042}
16043
16044/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16045/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16046/// dst when the corresponding mask bit is not set).
16047///
16048/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_pd)
16049#[inline]
16050#[target_feature(enable = "avx512fp16,avx512vl")]
16051#[cfg_attr(test, assert_instr(vcvtph2pd))]
16052#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16053pub fn _mm_mask_cvtph_pd(src: __m128d, k: __mmask8, a: __m128h) -> __m128d {
16054    unsafe { vcvtph2pd_128(a, src, k) }
16055}
16056
16057/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16058/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16059/// corresponding mask bit is not set).
16060///
16061/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_pd)
16062#[inline]
16063#[target_feature(enable = "avx512fp16,avx512vl")]
16064#[cfg_attr(test, assert_instr(vcvtph2pd))]
16065#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16066pub fn _mm_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m128d {
16067    _mm_mask_cvtph_pd(_mm_setzero_pd(), k, a)
16068}
16069
16070/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16071/// floating-point elements, and store the results in dst.
16072///
16073/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtph_pd)
16074#[inline]
16075#[target_feature(enable = "avx512fp16,avx512vl")]
16076#[cfg_attr(test, assert_instr(vcvtph2pd))]
16077#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16078pub fn _mm256_cvtph_pd(a: __m128h) -> __m256d {
16079    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), 0xff, a)
16080}
16081
16082/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16083/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16084/// dst when the corresponding mask bit is not set).
16085///
16086/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_pd)
16087#[inline]
16088#[target_feature(enable = "avx512fp16,avx512vl")]
16089#[cfg_attr(test, assert_instr(vcvtph2pd))]
16090#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16091pub fn _mm256_mask_cvtph_pd(src: __m256d, k: __mmask8, a: __m128h) -> __m256d {
16092    unsafe { vcvtph2pd_256(a, src, k) }
16093}
16094
16095/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16096/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16097/// corresponding mask bit is not set).
16098///
16099/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_pd)
16100#[inline]
16101#[target_feature(enable = "avx512fp16,avx512vl")]
16102#[cfg_attr(test, assert_instr(vcvtph2pd))]
16103#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16104pub fn _mm256_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m256d {
16105    _mm256_mask_cvtph_pd(_mm256_setzero_pd(), k, a)
16106}
16107
16108/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16109/// floating-point elements, and store the results in dst.
16110///
16111/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_pd)
16112#[inline]
16113#[target_feature(enable = "avx512fp16")]
16114#[cfg_attr(test, assert_instr(vcvtph2pd))]
16115#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16116pub fn _mm512_cvtph_pd(a: __m128h) -> __m512d {
16117    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0xff, a)
16118}
16119
16120/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16121/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16122/// dst when the corresponding mask bit is not set).
16123///
16124/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_pd)
16125#[inline]
16126#[target_feature(enable = "avx512fp16")]
16127#[cfg_attr(test, assert_instr(vcvtph2pd))]
16128#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16129pub fn _mm512_mask_cvtph_pd(src: __m512d, k: __mmask8, a: __m128h) -> __m512d {
16130    unsafe { vcvtph2pd_512(a, src, k, _MM_FROUND_CUR_DIRECTION) }
16131}
16132
16133/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16134/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16135/// corresponding mask bit is not set).
16136///
16137/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_pd)
16138#[inline]
16139#[target_feature(enable = "avx512fp16")]
16140#[cfg_attr(test, assert_instr(vcvtph2pd))]
16141#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16142pub fn _mm512_maskz_cvtph_pd(k: __mmask8, a: __m128h) -> __m512d {
16143    _mm512_mask_cvtph_pd(_mm512_setzero_pd(), k, a)
16144}
16145
16146/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16147/// floating-point elements, and store the results in dst.
16148///
16149/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16150///
16151/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_pd)
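///
/// A minimal usage sketch (illustrative only; `SAE` must be `_MM_FROUND_CUR_DIRECTION`
/// or `_MM_FROUND_NO_EXC`):
///
/// ```ignore
/// let a = _mm_set1_ph(3.25);
/// // All eight f64 lanes of the result are 3.25; exceptions are suppressed.
/// let r = _mm512_cvt_roundph_pd::<{ _MM_FROUND_NO_EXC }>(a);
/// ```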
16152#[inline]
16153#[target_feature(enable = "avx512fp16")]
16154#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16155#[rustc_legacy_const_generics(1)]
16156#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16157pub fn _mm512_cvt_roundph_pd<const SAE: i32>(a: __m128h) -> __m512d {
16158    static_assert_sae!(SAE);
16159    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), 0xff, a)
16160}
16161
16162/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16163/// floating-point elements, and store the results in dst using writemask k (elements are copied from src to
16164/// dst when the corresponding mask bit is not set).
16165///
16166/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16167///
16168/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_pd)
16169#[inline]
16170#[target_feature(enable = "avx512fp16")]
16171#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16172#[rustc_legacy_const_generics(3)]
16173#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16174pub fn _mm512_mask_cvt_roundph_pd<const SAE: i32>(
16175    src: __m512d,
16176    k: __mmask8,
16177    a: __m128h,
16178) -> __m512d {
16179    unsafe {
16180        static_assert_sae!(SAE);
16181        vcvtph2pd_512(a, src, k, SAE)
16182    }
16183}
16184
16185/// Convert packed half-precision (16-bit) floating-point elements in a to packed double-precision (64-bit)
16186/// floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the
16187/// corresponding mask bit is not set).
16188///
16189/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16190///
16191/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_pd)
16192#[inline]
16193#[target_feature(enable = "avx512fp16")]
16194#[cfg_attr(test, assert_instr(vcvtph2pd, SAE = 8))]
16195#[rustc_legacy_const_generics(2)]
16196#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16197pub fn _mm512_maskz_cvt_roundph_pd<const SAE: i32>(k: __mmask8, a: __m128h) -> __m512d {
16198    static_assert_sae!(SAE);
16199    _mm512_mask_cvt_roundph_pd::<SAE>(_mm512_setzero_pd(), k, a)
16200}
16201
16202/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16203/// floating-point element, store the result in the lower element of dst, and copy the upper element
16204/// from a to the upper element of dst.
16205///
16206/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_sd)
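///
/// A minimal usage sketch (illustrative only):
///
/// ```ignore
/// let a = _mm_set_pd(8.0, 7.0); // lanes: [7.0, 8.0]
/// let b = _mm_set_sh(0.5);
/// // Lane 0 is 0.5 converted from `b`; lane 1 is copied from `a`.
/// let r = _mm_cvtsh_sd(a, b);   // lanes: [0.5, 8.0]
/// ```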
16207#[inline]
16208#[target_feature(enable = "avx512fp16")]
16209#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16210#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16211pub fn _mm_cvtsh_sd(a: __m128d, b: __m128h) -> __m128d {
16212    _mm_mask_cvtsh_sd(a, 0xff, a, b)
16213}
16214
16215/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16216/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16217/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16218/// of dst.
16219///
16220/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsh_sd)
16221#[inline]
16222#[target_feature(enable = "avx512fp16")]
16223#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16224#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16225pub fn _mm_mask_cvtsh_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16226    unsafe { vcvtsh2sd(a, b, src, k, _MM_FROUND_CUR_DIRECTION) }
16227}
16228
16229/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16230/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16231/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16232///
16233/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsh_sd)
16234#[inline]
16235#[target_feature(enable = "avx512fp16")]
16236#[cfg_attr(test, assert_instr(vcvtsh2sd))]
16237#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16238pub fn _mm_maskz_cvtsh_sd(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16239    _mm_mask_cvtsh_sd(_mm_set_sd(0.0), k, a, b)
16240}
16241
16242/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16243/// floating-point element, store the result in the lower element of dst, and copy the upper element from a
16244/// to the upper element of dst.
16245///
16246/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16247///
16248/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_roundsh_sd)
16249#[inline]
16250#[target_feature(enable = "avx512fp16")]
16251#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16252#[rustc_legacy_const_generics(2)]
16253#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16254pub fn _mm_cvt_roundsh_sd<const SAE: i32>(a: __m128d, b: __m128h) -> __m128d {
16255    static_assert_sae!(SAE);
16256    _mm_mask_cvt_roundsh_sd::<SAE>(a, 0xff, a, b)
16257}
16258
16259/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16260/// floating-point element, store the result in the lower element of dst using writemask k (the element is
16261/// copied from src to dst when mask bit 0 is not set), and copy the upper element from a to the upper element
16262/// of dst.
16263///
16264/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16265///
16266/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundsh_sd)
16267#[inline]
16268#[target_feature(enable = "avx512fp16")]
16269#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16270#[rustc_legacy_const_generics(4)]
16271#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16272pub fn _mm_mask_cvt_roundsh_sd<const SAE: i32>(
16273    src: __m128d,
16274    k: __mmask8,
16275    a: __m128d,
16276    b: __m128h,
16277) -> __m128d {
16278    unsafe {
16279        static_assert_sae!(SAE);
16280        vcvtsh2sd(a, b, src, k, SAE)
16281    }
16282}
16283
16284/// Convert the lower half-precision (16-bit) floating-point element in b to a double-precision (64-bit)
16285/// floating-point element, store the result in the lower element of dst using zeromask k (the element is
16286/// zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
16287///
16288/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
16289///
16290/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundsh_sd)
16291#[inline]
16292#[target_feature(enable = "avx512fp16")]
16293#[cfg_attr(test, assert_instr(vcvtsh2sd, SAE = 8))]
16294#[rustc_legacy_const_generics(3)]
16295#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16296pub fn _mm_maskz_cvt_roundsh_sd<const SAE: i32>(k: __mmask8, a: __m128d, b: __m128h) -> __m128d {
16297    static_assert_sae!(SAE);
16298    _mm_mask_cvt_roundsh_sd::<SAE>(_mm_set_sd(0.0), k, a, b)
16299}
16300
16301/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16302///
16303/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsh_h)
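///
/// A minimal usage sketch (illustrative only):
///
/// ```ignore
/// let a = _mm_setr_ph(9.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
/// assert_eq!(_mm_cvtsh_h(a), 9.0);
/// ```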
16304#[inline]
16305#[target_feature(enable = "avx512fp16")]
16306#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16307pub fn _mm_cvtsh_h(a: __m128h) -> f16 {
16308    unsafe { simd_extract!(a, 0) }
16309}
16310
16311/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16312///
16313/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsh_h)
16314#[inline]
16315#[target_feature(enable = "avx512fp16")]
16316#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16317pub fn _mm256_cvtsh_h(a: __m256h) -> f16 {
16318    unsafe { simd_extract!(a, 0) }
16319}
16320
16321/// Copy the lower half-precision (16-bit) floating-point element from `a` to `dst`.
16322///
16323/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsh_h)
16324#[inline]
16325#[target_feature(enable = "avx512fp16")]
16326#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16327pub fn _mm512_cvtsh_h(a: __m512h) -> f16 {
16328    unsafe { simd_extract!(a, 0) }
16329}
16330
16331/// Copy the lower 16-bit integer in a to dst.
16332///
16333/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si16)
16334#[inline]
16335#[target_feature(enable = "avx512fp16")]
16336#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16337pub fn _mm_cvtsi128_si16(a: __m128i) -> i16 {
16338    unsafe { simd_extract!(a.as_i16x8(), 0) }
16339}
16340
16341/// Copy 16-bit integer a to the lower element of dst, and zero the upper elements of dst.
16342///
16343/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi16_si128)
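///
/// A minimal usage sketch (illustrative only; round-trips through `_mm_cvtsi128_si16`):
///
/// ```ignore
/// let v = _mm_cvtsi16_si128(42); // i16 lanes: [42, 0, 0, 0, 0, 0, 0, 0]
/// assert_eq!(_mm_cvtsi128_si16(v), 42);
/// ```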
16344#[inline]
16345#[target_feature(enable = "avx512fp16")]
16346#[unstable(feature = "stdarch_x86_avx512_f16", issue = "127213")]
16347pub fn _mm_cvtsi16_si128(a: i16) -> __m128i {
16348    unsafe { transmute(simd_insert!(i16x8::ZERO, 0, a)) }
16349}
16350
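// Declarations of the LLVM builtins that implement the intrinsics above; each `link_name`
// selects the corresponding `llvm.x86.*` intrinsic. The lint allowance below is needed
// because SIMD vector types cross the `extern "C"` boundary.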
16351#[allow(improper_ctypes)]
16352unsafe extern "C" {
16353    #[link_name = "llvm.x86.avx512fp16.mask.cmp.sh"]
16354    fn vcmpsh(a: __m128h, b: __m128h, imm8: i32, mask: __mmask8, sae: i32) -> __mmask8;
16355    #[link_name = "llvm.x86.avx512fp16.vcomi.sh"]
16356    fn vcomish(a: __m128h, b: __m128h, imm8: i32, sae: i32) -> i32;
16357
16358    #[link_name = "llvm.x86.avx512fp16.add.ph.512"]
16359    fn vaddph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16360    #[link_name = "llvm.x86.avx512fp16.sub.ph.512"]
16361    fn vsubph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16362    #[link_name = "llvm.x86.avx512fp16.mul.ph.512"]
16363    fn vmulph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16364    #[link_name = "llvm.x86.avx512fp16.div.ph.512"]
16365    fn vdivph(a: __m512h, b: __m512h, rounding: i32) -> __m512h;
16366
16367    #[link_name = "llvm.x86.avx512fp16.mask.add.sh.round"]
16368    fn vaddsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16369    #[link_name = "llvm.x86.avx512fp16.mask.sub.sh.round"]
16370    fn vsubsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16371    #[link_name = "llvm.x86.avx512fp16.mask.mul.sh.round"]
16372    fn vmulsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16373    #[link_name = "llvm.x86.avx512fp16.mask.div.sh.round"]
16374    fn vdivsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16375
16376    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.128"]
16377    fn vfmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16378    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.256"]
16379    fn vfmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16380    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.cph.512"]
16381    fn vfmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16382    #[link_name = "llvm.x86.avx512fp16.mask.vfmul.csh"]
16383    fn vfmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16384
16385    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.128"]
16386    fn vfcmulcph_128(a: __m128, b: __m128, src: __m128, k: __mmask8) -> __m128;
16387    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.256"]
16388    fn vfcmulcph_256(a: __m256, b: __m256, src: __m256, k: __mmask8) -> __m256;
16389    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.cph.512"]
16390    fn vfcmulcph_512(a: __m512, b: __m512, src: __m512, k: __mmask16, rounding: i32) -> __m512;
16391    #[link_name = "llvm.x86.avx512fp16.mask.vfcmul.csh"]
16392    fn vfcmulcsh(a: __m128, b: __m128, src: __m128, k: __mmask8, rounding: i32) -> __m128;
16393
16394    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.128"]
16395    fn vfmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16396    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.128"]
16397    fn vfmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16398    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.256"]
16399    fn vfmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16400    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.256"]
16401    fn vfmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16402    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.cph.512"]
16403    fn vfmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16404    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.cph.512"]
16405    fn vfmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32) -> __m512;
16406    #[link_name = "llvm.x86.avx512fp16.mask.vfmadd.csh"]
16407    fn vfmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16408    #[link_name = "llvm.x86.avx512fp16.maskz.vfmadd.csh"]
16409    fn vfmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16410
16411    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.128"]
16412    fn vfcmaddcph_mask3_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16413    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.128"]
16414    fn vfcmaddcph_maskz_128(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128;
16415    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.256"]
16416    fn vfcmaddcph_mask3_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16417    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.256"]
16418    fn vfcmaddcph_maskz_256(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256;
16419    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.cph.512"]
16420    fn vfcmaddcph_mask3_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16421    -> __m512;
16422    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.cph.512"]
16423    fn vfcmaddcph_maskz_512(a: __m512, b: __m512, c: __m512, k: __mmask16, rounding: i32)
16424    -> __m512;
16425    #[link_name = "llvm.x86.avx512fp16.mask.vfcmadd.csh"]
16426    fn vfcmaddcsh_mask(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16427    #[link_name = "llvm.x86.avx512fp16.maskz.vfcmadd.csh"]
16428    fn vfcmaddcsh_maskz(a: __m128, b: __m128, c: __m128, k: __mmask8, rounding: i32) -> __m128;
16429
16430    #[link_name = "llvm.x86.avx512fp16.vfmadd.ph.512"]
16431    fn vfmaddph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16432    #[link_name = "llvm.x86.avx512fp16.vfmadd.f16"]
16433    fn vfmaddsh(a: f16, b: f16, c: f16, rounding: i32) -> f16;
16434
16435    #[link_name = "llvm.x86.avx512fp16.vfmaddsub.ph.512"]
16436    fn vfmaddsubph_512(a: __m512h, b: __m512h, c: __m512h, rounding: i32) -> __m512h;
16437
16438    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.128"]
16439    fn vrcpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16440    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.256"]
16441    fn vrcpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16442    #[link_name = "llvm.x86.avx512fp16.mask.rcp.ph.512"]
16443    fn vrcpph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16444    #[link_name = "llvm.x86.avx512fp16.mask.rcp.sh"]
16445    fn vrcpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16446
16447    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.128"]
16448    fn vrsqrtph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16449    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.256"]
16450    fn vrsqrtph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16451    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.ph.512"]
16452    fn vrsqrtph_512(a: __m512h, src: __m512h, k: __mmask32) -> __m512h;
16453    #[link_name = "llvm.x86.avx512fp16.mask.rsqrt.sh"]
16454    fn vrsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16455
16456    #[link_name = "llvm.x86.avx512fp16.sqrt.ph.512"]
16457    fn vsqrtph_512(a: __m512h, rounding: i32) -> __m512h;
16458    #[link_name = "llvm.x86.avx512fp16.mask.sqrt.sh"]
16459    fn vsqrtsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16460
16461    #[link_name = "llvm.x86.avx512fp16.max.ph.128"]
16462    fn vmaxph_128(a: __m128h, b: __m128h) -> __m128h;
16463    #[link_name = "llvm.x86.avx512fp16.max.ph.256"]
16464    fn vmaxph_256(a: __m256h, b: __m256h) -> __m256h;
16465    #[link_name = "llvm.x86.avx512fp16.max.ph.512"]
16466    fn vmaxph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16467    #[link_name = "llvm.x86.avx512fp16.mask.max.sh.round"]
16468    fn vmaxsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16469
16470    #[link_name = "llvm.x86.avx512fp16.min.ph.128"]
16471    fn vminph_128(a: __m128h, b: __m128h) -> __m128h;
16472    #[link_name = "llvm.x86.avx512fp16.min.ph.256"]
16473    fn vminph_256(a: __m256h, b: __m256h) -> __m256h;
16474    #[link_name = "llvm.x86.avx512fp16.min.ph.512"]
16475    fn vminph_512(a: __m512h, b: __m512h, sae: i32) -> __m512h;
16476    #[link_name = "llvm.x86.avx512fp16.mask.min.sh.round"]
16477    fn vminsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16478
16479    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.128"]
16480    fn vgetexpph_128(a: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16481    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.256"]
16482    fn vgetexpph_256(a: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16483    #[link_name = "llvm.x86.avx512fp16.mask.getexp.ph.512"]
16484    fn vgetexpph_512(a: __m512h, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16485    #[link_name = "llvm.x86.avx512fp16.mask.getexp.sh"]
16486    fn vgetexpsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, sae: i32) -> __m128h;
16487
16488    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.128"]
16489    fn vgetmantph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16490    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.256"]
16491    fn vgetmantph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16492    #[link_name = "llvm.x86.avx512fp16.mask.getmant.ph.512"]
16493    fn vgetmantph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16494    #[link_name = "llvm.x86.avx512fp16.mask.getmant.sh"]
16495    fn vgetmantsh(
16496        a: __m128h,
16497        b: __m128h,
16498        imm8: i32,
16499        src: __m128h,
16500        k: __mmask8,
16501        sae: i32,
16502    ) -> __m128h;
16503
16504    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.128"]
16505    fn vrndscaleph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16506    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.256"]
16507    fn vrndscaleph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16508    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.ph.512"]
16509    fn vrndscaleph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16510    #[link_name = "llvm.x86.avx512fp16.mask.rndscale.sh"]
16511    fn vrndscalesh(
16512        a: __m128h,
16513        b: __m128h,
16514        src: __m128h,
16515        k: __mmask8,
16516        imm8: i32,
16517        sae: i32,
16518    ) -> __m128h;
16519
16520    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.128"]
16521    fn vscalefph_128(a: __m128h, b: __m128h, src: __m128h, k: __mmask8) -> __m128h;
16522    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.256"]
16523    fn vscalefph_256(a: __m256h, b: __m256h, src: __m256h, k: __mmask16) -> __m256h;
16524    #[link_name = "llvm.x86.avx512fp16.mask.scalef.ph.512"]
16525    fn vscalefph_512(a: __m512h, b: __m512h, src: __m512h, k: __mmask32, rounding: i32) -> __m512h;
16526    #[link_name = "llvm.x86.avx512fp16.mask.scalef.sh"]
16527    fn vscalefsh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16528
16529    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.128"]
16530    fn vreduceph_128(a: __m128h, imm8: i32, src: __m128h, k: __mmask8) -> __m128h;
16531    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.256"]
16532    fn vreduceph_256(a: __m256h, imm8: i32, src: __m256h, k: __mmask16) -> __m256h;
16533    #[link_name = "llvm.x86.avx512fp16.mask.reduce.ph.512"]
16534    fn vreduceph_512(a: __m512h, imm8: i32, src: __m512h, k: __mmask32, sae: i32) -> __m512h;
16535    #[link_name = "llvm.x86.avx512fp16.mask.reduce.sh"]
16536    fn vreducesh(a: __m128h, b: __m128h, src: __m128h, k: __mmask8, imm8: i32, sae: i32)
16537    -> __m128h;
16538
16539    #[link_name = "llvm.x86.avx512fp16.mask.fpclass.sh"]
16540    fn vfpclasssh(a: __m128h, imm8: i32, k: __mmask8) -> __mmask8;
16541
16542    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i16"]
16543    fn vcvtw2ph_128(a: i16x8, rounding: i32) -> __m128h;
16544    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i16"]
16545    fn vcvtw2ph_256(a: i16x16, rounding: i32) -> __m256h;
16546    #[link_name = "llvm.x86.avx512.sitofp.round.v32f16.v32i16"]
16547    fn vcvtw2ph_512(a: i16x32, rounding: i32) -> __m512h;
16548    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i16"]
16549    fn vcvtuw2ph_128(a: u16x8, rounding: i32) -> __m128h;
16550    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i16"]
16551    fn vcvtuw2ph_256(a: u16x16, rounding: i32) -> __m256h;
16552    #[link_name = "llvm.x86.avx512.uitofp.round.v32f16.v32i16"]
16553    fn vcvtuw2ph_512(a: u16x32, rounding: i32) -> __m512h;
16554
16555    #[link_name = "llvm.x86.avx512fp16.mask.vcvtdq2ph.128"]
16556    fn vcvtdq2ph_128(a: i32x4, src: __m128h, k: __mmask8) -> __m128h;
16557    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i32"]
16558    fn vcvtdq2ph_256(a: i32x8, rounding: i32) -> __m128h;
16559    #[link_name = "llvm.x86.avx512.sitofp.round.v16f16.v16i32"]
16560    fn vcvtdq2ph_512(a: i32x16, rounding: i32) -> __m256h;
16561    #[link_name = "llvm.x86.avx512fp16.vcvtsi2sh"]
16562    fn vcvtsi2sh(a: __m128h, b: i32, rounding: i32) -> __m128h;
16563    #[link_name = "llvm.x86.avx512fp16.mask.vcvtudq2ph.128"]
16564    fn vcvtudq2ph_128(a: u32x4, src: __m128h, k: __mmask8) -> __m128h;
16565    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i32"]
16566    fn vcvtudq2ph_256(a: u32x8, rounding: i32) -> __m128h;
16567    #[link_name = "llvm.x86.avx512.uitofp.round.v16f16.v16i32"]
16568    fn vcvtudq2ph_512(a: u32x16, rounding: i32) -> __m256h;
16569    #[link_name = "llvm.x86.avx512fp16.vcvtusi2sh"]
16570    fn vcvtusi2sh(a: __m128h, b: u32, rounding: i32) -> __m128h;
16571
16572    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.128"]
16573    fn vcvtqq2ph_128(a: i64x2, src: __m128h, k: __mmask8) -> __m128h;
16574    #[link_name = "llvm.x86.avx512fp16.mask.vcvtqq2ph.256"]
16575    fn vcvtqq2ph_256(a: i64x4, src: __m128h, k: __mmask8) -> __m128h;
16576    #[link_name = "llvm.x86.avx512.sitofp.round.v8f16.v8i64"]
16577    fn vcvtqq2ph_512(a: i64x8, rounding: i32) -> __m128h;
16578    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.128"]
16579    fn vcvtuqq2ph_128(a: u64x2, src: __m128h, k: __mmask8) -> __m128h;
16580    #[link_name = "llvm.x86.avx512fp16.mask.vcvtuqq2ph.256"]
16581    fn vcvtuqq2ph_256(a: u64x4, src: __m128h, k: __mmask8) -> __m128h;
16582    #[link_name = "llvm.x86.avx512.uitofp.round.v8f16.v8i64"]
16583    fn vcvtuqq2ph_512(a: u64x8, rounding: i32) -> __m128h;
16584
16585    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.128"]
16586    fn vcvtps2phx_128(a: __m128, src: __m128h, k: __mmask8) -> __m128h;
16587    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.256"]
16588    fn vcvtps2phx_256(a: __m256, src: __m128h, k: __mmask8) -> __m128h;
16589    #[link_name = "llvm.x86.avx512fp16.mask.vcvtps2phx.512"]
16590    fn vcvtps2phx_512(a: __m512, src: __m256h, k: __mmask16, rounding: i32) -> __m256h;
16591    #[link_name = "llvm.x86.avx512fp16.mask.vcvtss2sh.round"]
16592    fn vcvtss2sh(a: __m128h, b: __m128, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16593
16594    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.128"]
16595    fn vcvtpd2ph_128(a: __m128d, src: __m128h, k: __mmask8) -> __m128h;
16596    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.256"]
16597    fn vcvtpd2ph_256(a: __m256d, src: __m128h, k: __mmask8) -> __m128h;
16598    #[link_name = "llvm.x86.avx512fp16.mask.vcvtpd2ph.512"]
16599    fn vcvtpd2ph_512(a: __m512d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16600    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsd2sh.round"]
16601    fn vcvtsd2sh(a: __m128h, b: __m128d, src: __m128h, k: __mmask8, rounding: i32) -> __m128h;
16602
16603    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.128"]
16604    fn vcvtph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16605    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.256"]
16606    fn vcvtph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16607    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2w.512"]
16608    fn vcvtph2w_512(a: __m512h, src: i16x32, k: __mmask32, rounding: i32) -> i16x32;
16609    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.128"]
16610    fn vcvtph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16611    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.256"]
16612    fn vcvtph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16613    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uw.512"]
16614    fn vcvtph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16615
16616    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.128"]
16617    fn vcvttph2w_128(a: __m128h, src: i16x8, k: __mmask8) -> i16x8;
16618    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.256"]
16619    fn vcvttph2w_256(a: __m256h, src: i16x16, k: __mmask16) -> i16x16;
16620    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2w.512"]
16621    fn vcvttph2w_512(a: __m512h, src: i16x32, k: __mmask32, sae: i32) -> i16x32;
16622    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.128"]
16623    fn vcvttph2uw_128(a: __m128h, src: u16x8, k: __mmask8) -> u16x8;
16624    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.256"]
16625    fn vcvttph2uw_256(a: __m256h, src: u16x16, k: __mmask16) -> u16x16;
16626    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uw.512"]
16627    fn vcvttph2uw_512(a: __m512h, src: u16x32, k: __mmask32, sae: i32) -> u16x32;
16628
16629    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.128"]
16630    fn vcvtph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16631    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.256"]
16632    fn vcvtph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16633    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2dq.512"]
16634    fn vcvtph2dq_512(a: __m256h, src: i32x16, k: __mmask16, rounding: i32) -> i32x16;
16635    #[link_name = "llvm.x86.avx512fp16.vcvtsh2si32"]
16636    fn vcvtsh2si32(a: __m128h, rounding: i32) -> i32;
16637    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.128"]
16638    fn vcvtph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16639    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.256"]
16640    fn vcvtph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16641    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2udq.512"]
16642    fn vcvtph2udq_512(a: __m256h, src: u32x16, k: __mmask16, rounding: i32) -> u32x16;
16643    #[link_name = "llvm.x86.avx512fp16.vcvtsh2usi32"]
16644    fn vcvtsh2usi32(a: __m128h, sae: i32) -> u32;
16645
16646    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.128"]
16647    fn vcvttph2dq_128(a: __m128h, src: i32x4, k: __mmask8) -> i32x4;
16648    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.256"]
16649    fn vcvttph2dq_256(a: __m128h, src: i32x8, k: __mmask8) -> i32x8;
16650    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2dq.512"]
16651    fn vcvttph2dq_512(a: __m256h, src: i32x16, k: __mmask16, sae: i32) -> i32x16;
16652    #[link_name = "llvm.x86.avx512fp16.vcvttsh2si32"]
16653    fn vcvttsh2si32(a: __m128h, sae: i32) -> i32;
16654    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.128"]
16655    fn vcvttph2udq_128(a: __m128h, src: u32x4, k: __mmask8) -> u32x4;
16656    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.256"]
16657    fn vcvttph2udq_256(a: __m128h, src: u32x8, k: __mmask8) -> u32x8;
16658    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2udq.512"]
16659    fn vcvttph2udq_512(a: __m256h, src: u32x16, k: __mmask16, sae: i32) -> u32x16;
16660    #[link_name = "llvm.x86.avx512fp16.vcvttsh2usi32"]
16661    fn vcvttsh2usi32(a: __m128h, sae: i32) -> u32;
16662
16663    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.128"]
16664    fn vcvtph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16665    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.256"]
16666    fn vcvtph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16667    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2qq.512"]
16668    fn vcvtph2qq_512(a: __m128h, src: i64x8, k: __mmask8, rounding: i32) -> i64x8;
16669    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.128"]
16670    fn vcvtph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16671    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.256"]
16672    fn vcvtph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16673    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2uqq.512"]
16674    fn vcvtph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, rounding: i32) -> u64x8;
16675
16676    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.128"]
16677    fn vcvttph2qq_128(a: __m128h, src: i64x2, k: __mmask8) -> i64x2;
16678    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.256"]
16679    fn vcvttph2qq_256(a: __m128h, src: i64x4, k: __mmask8) -> i64x4;
16680    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2qq.512"]
16681    fn vcvttph2qq_512(a: __m128h, src: i64x8, k: __mmask8, sae: i32) -> i64x8;
16682    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.128"]
16683    fn vcvttph2uqq_128(a: __m128h, src: u64x2, k: __mmask8) -> u64x2;
16684    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.256"]
16685    fn vcvttph2uqq_256(a: __m128h, src: u64x4, k: __mmask8) -> u64x4;
16686    #[link_name = "llvm.x86.avx512fp16.mask.vcvttph2uqq.512"]
16687    fn vcvttph2uqq_512(a: __m128h, src: u64x8, k: __mmask8, sae: i32) -> u64x8;
16688
16689    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.128"]
16690    fn vcvtph2psx_128(a: __m128h, src: __m128, k: __mmask8) -> __m128;
16691    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.256"]
16692    fn vcvtph2psx_256(a: __m128h, src: __m256, k: __mmask8) -> __m256;
16693    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2psx.512"]
16694    fn vcvtph2psx_512(a: __m256h, src: __m512, k: __mmask16, sae: i32) -> __m512;
16695    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2ss.round"]
16696    fn vcvtsh2ss(a: __m128, b: __m128h, src: __m128, k: __mmask8, sae: i32) -> __m128;
16697
16698    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.128"]
16699    fn vcvtph2pd_128(a: __m128h, src: __m128d, k: __mmask8) -> __m128d;
16700    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.256"]
16701    fn vcvtph2pd_256(a: __m128h, src: __m256d, k: __mmask8) -> __m256d;
16702    #[link_name = "llvm.x86.avx512fp16.mask.vcvtph2pd.512"]
16703    fn vcvtph2pd_512(a: __m128h, src: __m512d, k: __mmask8, sae: i32) -> __m512d;
16704    #[link_name = "llvm.x86.avx512fp16.mask.vcvtsh2sd.round"]
16705    fn vcvtsh2sd(a: __m128d, b: __m128h, src: __m128d, k: __mmask8, sae: i32) -> __m128d;
16706
16707}
16708
16709#[cfg(test)]
16710mod tests {
16711    use crate::core_arch::x86::*;
16712    use crate::mem::transmute;
16713    use crate::ptr::{addr_of, addr_of_mut};
16714    use stdarch_test::simd_test;
16715
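    // Test-only helpers: splat a complex value as interleaved (re, im) `f16` pairs,
    // the element layout used by the packed complex (`_pch`) intrinsics.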
16716    #[target_feature(enable = "avx512fp16")]
16717    unsafe fn _mm_set1_pch(re: f16, im: f16) -> __m128h {
16718        _mm_setr_ph(re, im, re, im, re, im, re, im)
16719    }
16720
16721    #[target_feature(enable = "avx512fp16")]
16722    unsafe fn _mm256_set1_pch(re: f16, im: f16) -> __m256h {
16723        _mm256_setr_ph(
16724            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16725        )
16726    }
16727
16728    #[target_feature(enable = "avx512fp16")]
16729    unsafe fn _mm512_set1_pch(re: f16, im: f16) -> __m512h {
16730        _mm512_setr_ph(
16731            re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re, im,
16732            re, im, re, im, re, im, re, im, re, im,
16733        )
16734    }
16735
16736    #[simd_test(enable = "avx512fp16,avx512vl")]
16737    unsafe fn test_mm_set_ph() {
16738        let r = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16739        let e = _mm_setr_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16740        assert_eq_m128h(r, e);
16741    }
16742
16743    #[simd_test(enable = "avx512fp16,avx512vl")]
16744    unsafe fn test_mm256_set_ph() {
16745        let r = _mm256_set_ph(
16746            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16747        );
16748        let e = _mm256_setr_ph(
16749            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16750        );
16751        assert_eq_m256h(r, e);
16752    }
16753
16754    #[simd_test(enable = "avx512fp16")]
16755    unsafe fn test_mm512_set_ph() {
16756        let r = _mm512_set_ph(
16757            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16758            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16759            31.0, 32.0,
16760        );
16761        let e = _mm512_setr_ph(
16762            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16763            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16764            3.0, 2.0, 1.0,
16765        );
16766        assert_eq_m512h(r, e);
16767    }
16768
16769    #[simd_test(enable = "avx512fp16,avx512vl")]
16770    unsafe fn test_mm_set_sh() {
16771        let r = _mm_set_sh(1.0);
16772        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0);
16773        assert_eq_m128h(r, e);
16774    }
16775
16776    #[simd_test(enable = "avx512fp16,avx512vl")]
16777    unsafe fn test_mm_set1_ph() {
16778        let r = _mm_set1_ph(1.0);
16779        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
16780        assert_eq_m128h(r, e);
16781    }
16782
16783    #[simd_test(enable = "avx512fp16,avx512vl")]
16784    unsafe fn test_mm256_set1_ph() {
16785        let r = _mm256_set1_ph(1.0);
16786        let e = _mm256_set_ph(
16787            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16788        );
16789        assert_eq_m256h(r, e);
16790    }
16791
16792    #[simd_test(enable = "avx512fp16")]
16793    unsafe fn test_mm512_set1_ph() {
16794        let r = _mm512_set1_ph(1.0);
16795        let e = _mm512_set_ph(
16796            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16797            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
16798        );
16799        assert_eq_m512h(r, e);
16800    }
16801
16802    #[simd_test(enable = "avx512fp16,avx512vl")]
16803    unsafe fn test_mm_setr_ph() {
16804        let r = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
16805        let e = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
16806        assert_eq_m128h(r, e);
16807    }
16808
16809    #[simd_test(enable = "avx512fp16,avx512vl")]
16810    unsafe fn test_mm256_setr_ph() {
16811        let r = _mm256_setr_ph(
16812            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16813        );
16814        let e = _mm256_set_ph(
16815            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
16816        );
16817        assert_eq_m256h(r, e);
16818    }
16819
16820    #[simd_test(enable = "avx512fp16")]
16821    unsafe fn test_mm512_setr_ph() {
16822        let r = _mm512_setr_ph(
16823            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
16824            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
16825            31.0, 32.0,
16826        );
16827        let e = _mm512_set_ph(
16828            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
16829            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
16830            3.0, 2.0, 1.0,
16831        );
16832        assert_eq_m512h(r, e);
16833    }
16834
16835    #[simd_test(enable = "avx512fp16,avx512vl")]
16836    unsafe fn test_mm_setzero_ph() {
16837        let r = _mm_setzero_ph();
16838        let e = _mm_set1_ph(0.0);
16839        assert_eq_m128h(r, e);
16840    }
16841
16842    #[simd_test(enable = "avx512fp16,avx512vl")]
16843    unsafe fn test_mm256_setzero_ph() {
16844        let r = _mm256_setzero_ph();
16845        let e = _mm256_set1_ph(0.0);
16846        assert_eq_m256h(r, e);
16847    }
16848
16849    #[simd_test(enable = "avx512fp16")]
16850    unsafe fn test_mm512_setzero_ph() {
16851        let r = _mm512_setzero_ph();
16852        let e = _mm512_set1_ph(0.0);
16853        assert_eq_m512h(r, e);
16854    }
16855
16856    #[simd_test(enable = "avx512fp16,avx512vl")]
16857    unsafe fn test_mm_castsi128_ph() {
16858        let a = _mm_set1_epi16(0x3c00);
16859        let r = _mm_castsi128_ph(a);
16860        let e = _mm_set1_ph(1.0);
16861        assert_eq_m128h(r, e);
16862    }
16863
16864    #[simd_test(enable = "avx512fp16,avx512vl")]
16865    unsafe fn test_mm256_castsi256_ph() {
16866        let a = _mm256_set1_epi16(0x3c00);
16867        let r = _mm256_castsi256_ph(a);
16868        let e = _mm256_set1_ph(1.0);
16869        assert_eq_m256h(r, e);
16870    }
16871
16872    #[simd_test(enable = "avx512fp16")]
16873    unsafe fn test_mm512_castsi512_ph() {
16874        let a = _mm512_set1_epi16(0x3c00);
16875        let r = _mm512_castsi512_ph(a);
16876        let e = _mm512_set1_ph(1.0);
16877        assert_eq_m512h(r, e);
16878    }
16879
16880    #[simd_test(enable = "avx512fp16")]
16881    unsafe fn test_mm_castph_si128() {
16882        let a = _mm_set1_ph(1.0);
16883        let r = _mm_castph_si128(a);
16884        let e = _mm_set1_epi16(0x3c00);
16885        assert_eq_m128i(r, e);
16886    }
16887
16888    #[simd_test(enable = "avx512fp16")]
16889    unsafe fn test_mm256_castph_si256() {
16890        let a = _mm256_set1_ph(1.0);
16891        let r = _mm256_castph_si256(a);
16892        let e = _mm256_set1_epi16(0x3c00);
16893        assert_eq_m256i(r, e);
16894    }
16895
16896    #[simd_test(enable = "avx512fp16")]
16897    unsafe fn test_mm512_castph_si512() {
16898        let a = _mm512_set1_ph(1.0);
16899        let r = _mm512_castph_si512(a);
16900        let e = _mm512_set1_epi16(0x3c00);
16901        assert_eq_m512i(r, e);
16902    }
16903
16904    #[simd_test(enable = "avx512fp16,avx512vl")]
16905    unsafe fn test_mm_castps_ph() {
16906        let a = _mm_castsi128_ps(_mm_set1_epi16(0x3c00));
16907        let r = _mm_castps_ph(a);
16908        let e = _mm_set1_ph(1.0);
16909        assert_eq_m128h(r, e);
16910    }
16911
16912    #[simd_test(enable = "avx512fp16,avx512vl")]
16913    unsafe fn test_mm256_castps_ph() {
16914        let a = _mm256_castsi256_ps(_mm256_set1_epi16(0x3c00));
16915        let r = _mm256_castps_ph(a);
16916        let e = _mm256_set1_ph(1.0);
16917        assert_eq_m256h(r, e);
16918    }
16919
16920    #[simd_test(enable = "avx512fp16")]
16921    unsafe fn test_mm512_castps_ph() {
16922        let a = _mm512_castsi512_ps(_mm512_set1_epi16(0x3c00));
16923        let r = _mm512_castps_ph(a);
16924        let e = _mm512_set1_ph(1.0);
16925        assert_eq_m512h(r, e);
16926    }
16927
16928    #[simd_test(enable = "avx512fp16")]
16929    unsafe fn test_mm_castph_ps() {
16930        let a = _mm_castsi128_ph(_mm_set1_epi32(0x3f800000));
16931        let r = _mm_castph_ps(a);
16932        let e = _mm_set1_ps(1.0);
16933        assert_eq_m128(r, e);
16934    }
16935
16936    #[simd_test(enable = "avx512fp16")]
16937    unsafe fn test_mm256_castph_ps() {
16938        let a = _mm256_castsi256_ph(_mm256_set1_epi32(0x3f800000));
16939        let r = _mm256_castph_ps(a);
16940        let e = _mm256_set1_ps(1.0);
16941        assert_eq_m256(r, e);
16942    }
16943
16944    #[simd_test(enable = "avx512fp16")]
16945    unsafe fn test_mm512_castph_ps() {
16946        let a = _mm512_castsi512_ph(_mm512_set1_epi32(0x3f800000));
16947        let r = _mm512_castph_ps(a);
16948        let e = _mm512_set1_ps(1.0);
16949        assert_eq_m512(r, e);
16950    }
16951
16952    #[simd_test(enable = "avx512fp16,avx512vl")]
16953    unsafe fn test_mm_castpd_ph() {
16954        let a = _mm_castsi128_pd(_mm_set1_epi16(0x3c00));
16955        let r = _mm_castpd_ph(a);
16956        let e = _mm_set1_ph(1.0);
16957        assert_eq_m128h(r, e);
16958    }
16959
16960    #[simd_test(enable = "avx512fp16,avx512vl")]
16961    unsafe fn test_mm256_castpd_ph() {
16962        let a = _mm256_castsi256_pd(_mm256_set1_epi16(0x3c00));
16963        let r = _mm256_castpd_ph(a);
16964        let e = _mm256_set1_ph(1.0);
16965        assert_eq_m256h(r, e);
16966    }
16967
16968    #[simd_test(enable = "avx512fp16")]
16969    unsafe fn test_mm512_castpd_ph() {
16970        let a = _mm512_castsi512_pd(_mm512_set1_epi16(0x3c00));
16971        let r = _mm512_castpd_ph(a);
16972        let e = _mm512_set1_ph(1.0);
16973        assert_eq_m512h(r, e);
16974    }
16975
16976    #[simd_test(enable = "avx512fp16")]
16977    unsafe fn test_mm_castph_pd() {
16978        let a = _mm_castsi128_ph(_mm_set1_epi64x(0x3ff0000000000000));
16979        let r = _mm_castph_pd(a);
16980        let e = _mm_set1_pd(1.0);
16981        assert_eq_m128d(r, e);
16982    }
16983
16984    #[simd_test(enable = "avx512fp16")]
16985    unsafe fn test_mm256_castph_pd() {
16986        let a = _mm256_castsi256_ph(_mm256_set1_epi64x(0x3ff0000000000000));
16987        let r = _mm256_castph_pd(a);
16988        let e = _mm256_set1_pd(1.0);
16989        assert_eq_m256d(r, e);
16990    }
16991
16992    #[simd_test(enable = "avx512fp16")]
16993    unsafe fn test_mm512_castph_pd() {
16994        let a = _mm512_castsi512_ph(_mm512_set1_epi64(0x3ff0000000000000));
16995        let r = _mm512_castph_pd(a);
16996        let e = _mm512_set1_pd(1.0);
16997        assert_eq_m512d(r, e);
16998    }
16999
17000    #[simd_test(enable = "avx512fp16,avx512vl")]
17001    unsafe fn test_mm256_castph256_ph128() {
17002        let a = _mm256_setr_ph(
17003            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17004        );
17005        let r = _mm256_castph256_ph128(a);
17006        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17007        assert_eq_m128h(r, e);
17008    }
17009
17010    #[simd_test(enable = "avx512fp16,avx512vl")]
17011    unsafe fn test_mm512_castph512_ph128() {
17012        let a = _mm512_setr_ph(
17013            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
17014            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
17015        );
17016        let r = _mm512_castph512_ph128(a);
17017        let e = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17018        assert_eq_m128h(r, e);
17019    }
17020
17021    #[simd_test(enable = "avx512fp16,avx512vl")]
17022    unsafe fn test_mm512_castph512_ph256() {
17023        let a = _mm512_setr_ph(
17024            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 17., 18., 19.,
17025            20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
17026        );
17027        let r = _mm512_castph512_ph256(a);
17028        let e = _mm256_setr_ph(
17029            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17030        );
17031        assert_eq_m256h(r, e);
17032    }
17033
17034    #[simd_test(enable = "avx512fp16,avx512vl")]
17035    unsafe fn test_mm256_castph128_ph256() {
17036        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17037        let r = _mm256_castph128_ph256(a);
17038        assert_eq_m128h(_mm256_castph256_ph128(r), a);
17039    }
17040
17041    #[simd_test(enable = "avx512fp16,avx512vl")]
17042    unsafe fn test_mm512_castph128_ph512() {
17043        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17044        let r = _mm512_castph128_ph512(a);
17045        assert_eq_m128h(_mm512_castph512_ph128(r), a);
17046    }
17047
17048    #[simd_test(enable = "avx512fp16,avx512vl")]
17049    unsafe fn test_mm512_castph256_ph512() {
17050        let a = _mm256_setr_ph(
17051            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17052        );
17053        let r = _mm512_castph256_ph512(a);
17054        assert_eq_m256h(_mm512_castph512_ph256(r), a);
17055    }
17056
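    // The zext* widening intrinsics zero the newly added upper elements, whereas the
    // cast*_ph256/ph512 widening casts above leave them undefined - which is why those
    // cast tests only assert the low 128/256 bits of the result.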
17057    #[simd_test(enable = "avx512fp16,avx512vl")]
17058    unsafe fn test_mm256_zextph128_ph256() {
17059        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17060        let r = _mm256_zextph128_ph256(a);
17061        let e = _mm256_setr_ph(
17062            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
17063        );
17064        assert_eq_m256h(r, e);
17065    }
17066
17067    #[simd_test(enable = "avx512fp16")]
17068    unsafe fn test_mm512_zextph128_ph512() {
17069        let a = _mm_setr_ph(1., 2., 3., 4., 5., 6., 7., 8.);
17070        let r = _mm512_zextph128_ph512(a);
17071        let e = _mm512_setr_ph(
17072            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17073            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17074        );
17075        assert_eq_m512h(r, e);
17076    }
17077
17078    #[simd_test(enable = "avx512fp16")]
17079    unsafe fn test_mm512_zextph256_ph512() {
17080        let a = _mm256_setr_ph(
17081            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
17082        );
17083        let r = _mm512_zextph256_ph512(a);
17084        let e = _mm512_setr_ph(
17085            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16., 0., 0., 0., 0.,
17086            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
17087        );
17088        assert_eq_m512h(r, e);
17089    }
17090
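    // Note the argument order: _mm_set_ph lists elements from the highest index (e7)
    // down to e0, so the last argument is element 0 and maps to bit 0 of the comparison
    // mask. Only the high-half elements compare equal here, hence mask bits 4..7.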
17091    #[simd_test(enable = "avx512fp16,avx512vl")]
17092    unsafe fn test_mm_cmp_ph_mask() {
17093        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17094        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17095        let r = _mm_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17096        assert_eq!(r, 0b11110000);
17097    }
17098
17099    #[simd_test(enable = "avx512fp16,avx512vl")]
17100    unsafe fn test_mm_mask_cmp_ph_mask() {
17101        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17102        let b = _mm_set_ph(1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0);
17103        let r = _mm_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101, a, b);
17104        assert_eq!(r, 0b01010000);
17105    }
17106
17107    #[simd_test(enable = "avx512fp16,avx512vl")]
17108    unsafe fn test_mm256_cmp_ph_mask() {
17109        let a = _mm256_set_ph(
17110            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17111        );
17112        let b = _mm256_set_ph(
17113            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17114            -16.0,
17115        );
17116        let r = _mm256_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17117        assert_eq!(r, 0b1111000011110000);
17118    }
17119
17120    #[simd_test(enable = "avx512fp16,avx512vl")]
17121    unsafe fn test_mm256_mask_cmp_ph_mask() {
17122        let a = _mm256_set_ph(
17123            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17124        );
17125        let b = _mm256_set_ph(
17126            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17127            -16.0,
17128        );
17129        let r = _mm256_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b0101010101010101, a, b);
17130        assert_eq!(r, 0b0101000001010000);
17131    }
17132
17133    #[simd_test(enable = "avx512fp16")]
17134    unsafe fn test_mm512_cmp_ph_mask() {
17135        let a = _mm512_set_ph(
17136            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17137            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17138            31.0, 32.0,
17139        );
17140        let b = _mm512_set_ph(
17141            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17142            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17143            -29.0, -30.0, -31.0, -32.0,
17144        );
17145        let r = _mm512_cmp_ph_mask::<_CMP_EQ_OQ>(a, b);
17146        assert_eq!(r, 0b11110000111100001111000011110000);
17147    }
17148
17149    #[simd_test(enable = "avx512fp16")]
17150    unsafe fn test_mm512_mask_cmp_ph_mask() {
17151        let a = _mm512_set_ph(
17152            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17153            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17154            31.0, 32.0,
17155        );
17156        let b = _mm512_set_ph(
17157            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17158            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17159            -29.0, -30.0, -31.0, -32.0,
17160        );
17161        let r = _mm512_mask_cmp_ph_mask::<_CMP_EQ_OQ>(0b01010101010101010101010101010101, a, b);
17162        assert_eq!(r, 0b01010000010100000101000001010000);
17163    }
17164
17165    #[simd_test(enable = "avx512fp16")]
17166    unsafe fn test_mm512_cmp_round_ph_mask() {
17167        let a = _mm512_set_ph(
17168            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17169            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17170            31.0, 32.0,
17171        );
17172        let b = _mm512_set_ph(
17173            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17174            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17175            -29.0, -30.0, -31.0, -32.0,
17176        );
17177        let r = _mm512_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17178        assert_eq!(r, 0b11110000111100001111000011110000);
17179    }
17180
17181    #[simd_test(enable = "avx512fp16")]
17182    unsafe fn test_mm512_mask_cmp_round_ph_mask() {
17183        let a = _mm512_set_ph(
17184            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17185            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17186            31.0, 32.0,
17187        );
17188        let b = _mm512_set_ph(
17189            1.0, 2.0, 3.0, 4.0, -5.0, -6.0, -7.0, -8.0, 9.0, 10.0, 11.0, 12.0, -13.0, -14.0, -15.0,
17190            -16.0, 17.0, 18.0, 19.0, 20.0, -21.0, -22.0, -23.0, -24.0, 25.0, 26.0, 27.0, 28.0,
17191            -29.0, -30.0, -31.0, -32.0,
17192        );
17193        let r = _mm512_mask_cmp_round_ph_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(
17194            0b01010101010101010101010101010101,
17195            a,
17196            b,
17197        );
17198        assert_eq!(r, 0b01010000010100000101000001010000);
17199    }
17200
17201    #[simd_test(enable = "avx512fp16")]
17202    unsafe fn test_mm_cmp_round_sh_mask() {
17203        let a = _mm_set_sh(1.0);
17204        let b = _mm_set_sh(1.0);
17205        let r = _mm_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17206        assert_eq!(r, 1);
17207    }
17208
17209    #[simd_test(enable = "avx512fp16")]
17210    unsafe fn test_mm_mask_cmp_round_sh_mask() {
17211        let a = _mm_set_sh(1.0);
17212        let b = _mm_set_sh(1.0);
17213        let r = _mm_mask_cmp_round_sh_mask::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(0, a, b);
17214        assert_eq!(r, 0);
17215    }
17216
17217    #[simd_test(enable = "avx512fp16")]
17218    unsafe fn test_mm_cmp_sh_mask() {
17219        let a = _mm_set_sh(1.0);
17220        let b = _mm_set_sh(1.0);
17221        let r = _mm_cmp_sh_mask::<_CMP_EQ_OQ>(a, b);
17222        assert_eq!(r, 1);
17223    }
17224
17225    #[simd_test(enable = "avx512fp16")]
17226    unsafe fn test_mm_mask_cmp_sh_mask() {
17227        let a = _mm_set_sh(1.0);
17228        let b = _mm_set_sh(1.0);
17229        let r = _mm_mask_cmp_sh_mask::<_CMP_EQ_OQ>(0, a, b);
17230        assert_eq!(r, 0);
17231    }
17232
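    // The comi/ucomi scalar comparisons return an i32 (1 if the predicate holds for
    // element 0, 0 otherwise) rather than a mask register value.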
17233    #[simd_test(enable = "avx512fp16")]
17234    unsafe fn test_mm_comi_round_sh() {
17235        let a = _mm_set_sh(1.0);
17236        let b = _mm_set_sh(1.0);
17237        let r = _mm_comi_round_sh::<_CMP_EQ_OQ, _MM_FROUND_NO_EXC>(a, b);
17238        assert_eq!(r, 1);
17239    }
17240
17241    #[simd_test(enable = "avx512fp16")]
17242    unsafe fn test_mm_comi_sh() {
17243        let a = _mm_set_sh(1.0);
17244        let b = _mm_set_sh(1.0);
17245        let r = _mm_comi_sh::<_CMP_EQ_OQ>(a, b);
17246        assert_eq!(r, 1);
17247    }
17248
17249    #[simd_test(enable = "avx512fp16")]
17250    unsafe fn test_mm_comieq_sh() {
17251        let a = _mm_set_sh(1.0);
17252        let b = _mm_set_sh(1.0);
17253        let r = _mm_comieq_sh(a, b);
17254        assert_eq!(r, 1);
17255    }
17256
17257    #[simd_test(enable = "avx512fp16")]
17258    unsafe fn test_mm_comige_sh() {
17259        let a = _mm_set_sh(2.0);
17260        let b = _mm_set_sh(1.0);
17261        let r = _mm_comige_sh(a, b);
17262        assert_eq!(r, 1);
17263    }
17264
17265    #[simd_test(enable = "avx512fp16")]
17266    unsafe fn test_mm_comigt_sh() {
17267        let a = _mm_set_sh(2.0);
17268        let b = _mm_set_sh(1.0);
17269        let r = _mm_comigt_sh(a, b);
17270        assert_eq!(r, 1);
17271    }
17272
17273    #[simd_test(enable = "avx512fp16")]
17274    unsafe fn test_mm_comile_sh() {
17275        let a = _mm_set_sh(1.0);
17276        let b = _mm_set_sh(2.0);
17277        let r = _mm_comile_sh(a, b);
17278        assert_eq!(r, 1);
17279    }
17280
17281    #[simd_test(enable = "avx512fp16")]
17282    unsafe fn test_mm_comilt_sh() {
17283        let a = _mm_set_sh(1.0);
17284        let b = _mm_set_sh(2.0);
17285        let r = _mm_comilt_sh(a, b);
17286        assert_eq!(r, 1);
17287    }
17288
17289    #[simd_test(enable = "avx512fp16")]
17290    unsafe fn test_mm_comineq_sh() {
17291        let a = _mm_set_sh(1.0);
17292        let b = _mm_set_sh(2.0);
17293        let r = _mm_comineq_sh(a, b);
17294        assert_eq!(r, 1);
17295    }
17296
17297    #[simd_test(enable = "avx512fp16")]
17298    unsafe fn test_mm_ucomieq_sh() {
17299        let a = _mm_set_sh(1.0);
17300        let b = _mm_set_sh(1.0);
17301        let r = _mm_ucomieq_sh(a, b);
17302        assert_eq!(r, 1);
17303    }
17304
17305    #[simd_test(enable = "avx512fp16")]
17306    unsafe fn test_mm_ucomige_sh() {
17307        let a = _mm_set_sh(2.0);
17308        let b = _mm_set_sh(1.0);
17309        let r = _mm_ucomige_sh(a, b);
17310        assert_eq!(r, 1);
17311    }
17312
17313    #[simd_test(enable = "avx512fp16")]
17314    unsafe fn test_mm_ucomigt_sh() {
17315        let a = _mm_set_sh(2.0);
17316        let b = _mm_set_sh(1.0);
17317        let r = _mm_ucomigt_sh(a, b);
17318        assert_eq!(r, 1);
17319    }
17320
17321    #[simd_test(enable = "avx512fp16")]
17322    unsafe fn test_mm_ucomile_sh() {
17323        let a = _mm_set_sh(1.0);
17324        let b = _mm_set_sh(2.0);
17325        let r = _mm_ucomile_sh(a, b);
17326        assert_eq!(r, 1);
17327    }
17328
17329    #[simd_test(enable = "avx512fp16")]
17330    unsafe fn test_mm_ucomilt_sh() {
17331        let a = _mm_set_sh(1.0);
17332        let b = _mm_set_sh(2.0);
17333        let r = _mm_ucomilt_sh(a, b);
17334        assert_eq!(r, 1);
17335    }
17336
17337    #[simd_test(enable = "avx512fp16")]
17338    unsafe fn test_mm_ucomineq_sh() {
17339        let a = _mm_set_sh(1.0);
17340        let b = _mm_set_sh(2.0);
17341        let r = _mm_ucomineq_sh(a, b);
17342        assert_eq!(r, 1);
17343    }
17344
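    // The aligned loads (_mm_load_ph and friends) require the source address to be
    // aligned to the vector width; taking the address of a vector local with addr_of!
    // satisfies this. The _loadu_ variants tested further below have no such requirement.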
17345    #[simd_test(enable = "avx512fp16,avx512vl")]
17346    unsafe fn test_mm_load_ph() {
17347        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17348        let b = _mm_load_ph(addr_of!(a).cast());
17349        assert_eq_m128h(a, b);
17350    }
17351
17352    #[simd_test(enable = "avx512fp16,avx512vl")]
17353    unsafe fn test_mm256_load_ph() {
17354        let a = _mm256_set_ph(
17355            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17356        );
17357        let b = _mm256_load_ph(addr_of!(a).cast());
17358        assert_eq_m256h(a, b);
17359    }
17360
17361    #[simd_test(enable = "avx512fp16")]
17362    unsafe fn test_mm512_load_ph() {
17363        let a = _mm512_set_ph(
17364            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17365            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17366            31.0, 32.0,
17367        );
17368        let b = _mm512_load_ph(addr_of!(a).cast());
17369        assert_eq_m512h(a, b);
17370    }
17371
17372    #[simd_test(enable = "avx512fp16,avx512vl")]
17373    unsafe fn test_mm_load_sh() {
17374        let a = _mm_set_sh(1.0);
17375        let b = _mm_load_sh(addr_of!(a).cast());
17376        assert_eq_m128h(a, b);
17377    }
17378
17379    #[simd_test(enable = "avx512fp16,avx512vl")]
17380    unsafe fn test_mm_mask_load_sh() {
17381        let a = _mm_set_sh(1.0);
17382        let src = _mm_set_sh(2.);
17383        let b = _mm_mask_load_sh(src, 1, addr_of!(a).cast());
17384        assert_eq_m128h(a, b);
17385        let b = _mm_mask_load_sh(src, 0, addr_of!(a).cast());
17386        assert_eq_m128h(src, b);
17387    }
17388
17389    #[simd_test(enable = "avx512fp16,avx512vl")]
17390    unsafe fn test_mm_maskz_load_sh() {
17391        let a = _mm_set_sh(1.0);
17392        let b = _mm_maskz_load_sh(1, addr_of!(a).cast());
17393        assert_eq_m128h(a, b);
17394        let b = _mm_maskz_load_sh(0, addr_of!(a).cast());
17395        assert_eq_m128h(_mm_setzero_ph(), b);
17396    }
17397
17398    #[simd_test(enable = "avx512fp16,avx512vl")]
17399    unsafe fn test_mm_loadu_ph() {
17400        let array = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
17401        let r = _mm_loadu_ph(array.as_ptr());
17402        let e = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17403        assert_eq_m128h(r, e);
17404    }
17405
17406    #[simd_test(enable = "avx512fp16,avx512vl")]
17407    unsafe fn test_mm256_loadu_ph() {
17408        let array = [
17409            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17410        ];
17411        let r = _mm256_loadu_ph(array.as_ptr());
17412        let e = _mm256_setr_ph(
17413            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17414        );
17415        assert_eq_m256h(r, e);
17416    }
17417
17418    #[simd_test(enable = "avx512fp16")]
17419    unsafe fn test_mm512_loadu_ph() {
17420        let array = [
17421            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17422            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17423            31.0, 32.0,
17424        ];
17425        let r = _mm512_loadu_ph(array.as_ptr());
17426        let e = _mm512_setr_ph(
17427            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17428            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17429            31.0, 32.0,
17430        );
17431        assert_eq_m512h(r, e);
17432    }
17433
17434    #[simd_test(enable = "avx512fp16,avx512vl")]
17435    unsafe fn test_mm_move_sh() {
17436        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17437        let b = _mm_set_sh(9.0);
17438        let r = _mm_move_sh(a, b);
17439        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 9.0);
17440        assert_eq_m128h(r, e);
17441    }
17442
17443    #[simd_test(enable = "avx512fp16,avx512vl")]
17444    unsafe fn test_mm_mask_move_sh() {
17445        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17446        let b = _mm_set_sh(9.0);
17447        let src = _mm_set_sh(10.0);
17448        let r = _mm_mask_move_sh(src, 0, a, b);
17449        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0);
17450        assert_eq_m128h(r, e);
17451    }
17452
17453    #[simd_test(enable = "avx512fp16,avx512vl")]
17454    unsafe fn test_mm_maskz_move_sh() {
17455        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17456        let b = _mm_set_sh(9.0);
17457        let r = _mm_maskz_move_sh(0, a, b);
17458        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0);
17459        assert_eq_m128h(r, e);
17460    }
17461
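    // The aligned store tests mirror the aligned loads above: the destination obtained
    // via addr_of_mut! on a vector local is suitably aligned for _mm_store_ph and friends.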
17462    #[simd_test(enable = "avx512fp16,avx512vl")]
17463    unsafe fn test_mm_store_ph() {
17464        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17465        let mut b = _mm_setzero_ph();
17466        _mm_store_ph(addr_of_mut!(b).cast(), a);
17467        assert_eq_m128h(a, b);
17468    }
17469
17470    #[simd_test(enable = "avx512fp16,avx512vl")]
17471    unsafe fn test_mm256_store_ph() {
17472        let a = _mm256_set_ph(
17473            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17474        );
17475        let mut b = _mm256_setzero_ph();
17476        _mm256_store_ph(addr_of_mut!(b).cast(), a);
17477        assert_eq_m256h(a, b);
17478    }
17479
17480    #[simd_test(enable = "avx512fp16")]
17481    unsafe fn test_mm512_store_ph() {
17482        let a = _mm512_set_ph(
17483            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17484            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17485            31.0, 32.0,
17486        );
17487        let mut b = _mm512_setzero_ph();
17488        _mm512_store_ph(addr_of_mut!(b).cast(), a);
17489        assert_eq_m512h(a, b);
17490    }
17491
17492    #[simd_test(enable = "avx512fp16,avx512vl")]
17493    unsafe fn test_mm_store_sh() {
17494        let a = _mm_set_sh(1.0);
17495        let mut b = _mm_setzero_ph();
17496        _mm_store_sh(addr_of_mut!(b).cast(), a);
17497        assert_eq_m128h(a, b);
17498    }
17499
17500    #[simd_test(enable = "avx512fp16,avx512vl")]
17501    unsafe fn test_mm_mask_store_sh() {
17502        let a = _mm_set_sh(1.0);
17503        let mut b = _mm_setzero_ph();
17504        _mm_mask_store_sh(addr_of_mut!(b).cast(), 0, a);
17505        assert_eq_m128h(_mm_setzero_ph(), b);
17506        _mm_mask_store_sh(addr_of_mut!(b).cast(), 1, a);
17507        assert_eq_m128h(a, b);
17508    }
17509
17510    #[simd_test(enable = "avx512fp16,avx512vl")]
17511    unsafe fn test_mm_storeu_ph() {
17512        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17513        let mut array = [0.0; 8];
17514        _mm_storeu_ph(array.as_mut_ptr(), a);
17515        assert_eq_m128h(a, _mm_loadu_ph(array.as_ptr()));
17516    }
17517
17518    #[simd_test(enable = "avx512fp16,avx512vl")]
17519    unsafe fn test_mm256_storeu_ph() {
17520        let a = _mm256_set_ph(
17521            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17522        );
17523        let mut array = [0.0; 16];
17524        _mm256_storeu_ph(array.as_mut_ptr(), a);
17525        assert_eq_m256h(a, _mm256_loadu_ph(array.as_ptr()));
17526    }
17527
17528    #[simd_test(enable = "avx512fp16")]
17529    unsafe fn test_mm512_storeu_ph() {
17530        let a = _mm512_set_ph(
17531            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17532            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17533            31.0, 32.0,
17534        );
17535        let mut array = [0.0; 32];
17536        _mm512_storeu_ph(array.as_mut_ptr(), a);
17537        assert_eq_m512h(a, _mm512_loadu_ph(array.as_ptr()));
17538    }
17539
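    // For the masked arithmetic tests that follow: mask_* variants copy lanes from `src`
    // where the corresponding mask bit is 0, while maskz_* variants zero those lanes.
    // The 0b0101... masks therefore keep the computed result in every other lane only.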
17540    #[simd_test(enable = "avx512fp16,avx512vl")]
17541    unsafe fn test_mm_add_ph() {
17542        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17543        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17544        let r = _mm_add_ph(a, b);
17545        let e = _mm_set1_ph(9.0);
17546        assert_eq_m128h(r, e);
17547    }
17548
17549    #[simd_test(enable = "avx512fp16,avx512vl")]
17550    unsafe fn test_mm_mask_add_ph() {
17551        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17552        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17553        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17554        let r = _mm_mask_add_ph(src, 0b01010101, a, b);
17555        let e = _mm_set_ph(10., 9., 12., 9., 14., 9., 16., 9.);
17556        assert_eq_m128h(r, e);
17557    }
17558
17559    #[simd_test(enable = "avx512fp16,avx512vl")]
17560    unsafe fn test_mm_maskz_add_ph() {
17561        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17562        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17563        let r = _mm_maskz_add_ph(0b01010101, a, b);
17564        let e = _mm_set_ph(0., 9., 0., 9., 0., 9., 0., 9.);
17565        assert_eq_m128h(r, e);
17566    }
17567
17568    #[simd_test(enable = "avx512fp16,avx512vl")]
17569    unsafe fn test_mm256_add_ph() {
17570        let a = _mm256_set_ph(
17571            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17572        );
17573        let b = _mm256_set_ph(
17574            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17575        );
17576        let r = _mm256_add_ph(a, b);
17577        let e = _mm256_set1_ph(17.0);
17578        assert_eq_m256h(r, e);
17579    }
17580
17581    #[simd_test(enable = "avx512fp16,avx512vl")]
17582    unsafe fn test_mm256_mask_add_ph() {
17583        let a = _mm256_set_ph(
17584            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17585        );
17586        let b = _mm256_set_ph(
17587            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17588        );
17589        let src = _mm256_set_ph(
17590            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17591        );
17592        let r = _mm256_mask_add_ph(src, 0b0101010101010101, a, b);
17593        let e = _mm256_set_ph(
17594            18., 17., 20., 17., 22., 17., 24., 17., 26., 17., 28., 17., 30., 17., 32., 17.,
17595        );
17596        assert_eq_m256h(r, e);
17597    }
17598
17599    #[simd_test(enable = "avx512fp16,avx512vl")]
17600    unsafe fn test_mm256_maskz_add_ph() {
17601        let a = _mm256_set_ph(
17602            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17603        );
17604        let b = _mm256_set_ph(
17605            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17606        );
17607        let r = _mm256_maskz_add_ph(0b0101010101010101, a, b);
17608        let e = _mm256_set_ph(
17609            0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17., 0., 17.,
17610        );
17611        assert_eq_m256h(r, e);
17612    }
17613
17614    #[simd_test(enable = "avx512fp16")]
17615    unsafe fn test_mm512_add_ph() {
17616        let a = _mm512_set_ph(
17617            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17618            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17619            31.0, 32.0,
17620        );
17621        let b = _mm512_set_ph(
17622            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17623            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17624            3.0, 2.0, 1.0,
17625        );
17626        let r = _mm512_add_ph(a, b);
17627        let e = _mm512_set1_ph(33.0);
17628        assert_eq_m512h(r, e);
17629    }
17630
17631    #[simd_test(enable = "avx512fp16")]
17632    unsafe fn test_mm512_mask_add_ph() {
17633        let a = _mm512_set_ph(
17634            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17635            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17636            31.0, 32.0,
17637        );
17638        let b = _mm512_set_ph(
17639            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17640            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17641            3.0, 2.0, 1.0,
17642        );
17643        let src = _mm512_set_ph(
17644            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17645            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17646        );
17647        let r = _mm512_mask_add_ph(src, 0b01010101010101010101010101010101, a, b);
17648        let e = _mm512_set_ph(
17649            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17650            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17651        );
17652        assert_eq_m512h(r, e);
17653    }
17654
17655    #[simd_test(enable = "avx512fp16")]
17656    unsafe fn test_mm512_maskz_add_ph() {
17657        let a = _mm512_set_ph(
17658            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17659            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17660            31.0, 32.0,
17661        );
17662        let b = _mm512_set_ph(
17663            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17664            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17665            3.0, 2.0, 1.0,
17666        );
17667        let r = _mm512_maskz_add_ph(0b01010101010101010101010101010101, a, b);
17668        let e = _mm512_set_ph(
17669            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17670            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17671        );
17672        assert_eq_m512h(r, e);
17673    }
17674
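    // The *_round_* variants take the rounding control as a const generic;
    // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC selects round-to-nearest-even
    // while suppressing floating-point exceptions (SAE).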
17675    #[simd_test(enable = "avx512fp16")]
17676    unsafe fn test_mm512_add_round_ph() {
17677        let a = _mm512_set_ph(
17678            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17679            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17680            31.0, 32.0,
17681        );
17682        let b = _mm512_set_ph(
17683            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17684            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17685            3.0, 2.0, 1.0,
17686        );
17687        let r = _mm512_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17688        let e = _mm512_set1_ph(33.0);
17689        assert_eq_m512h(r, e);
17690    }
17691
17692    #[simd_test(enable = "avx512fp16")]
17693    unsafe fn test_mm512_mask_add_round_ph() {
17694        let a = _mm512_set_ph(
17695            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17696            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17697            31.0, 32.0,
17698        );
17699        let b = _mm512_set_ph(
17700            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17701            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17702            3.0, 2.0, 1.0,
17703        );
17704        let src = _mm512_set_ph(
17705            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17706            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17707        );
17708        let r = _mm512_mask_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17709            src,
17710            0b01010101010101010101010101010101,
17711            a,
17712            b,
17713        );
17714        let e = _mm512_set_ph(
17715            34., 33., 36., 33., 38., 33., 40., 33., 42., 33., 44., 33., 46., 33., 48., 33., 50.,
17716            33., 52., 33., 54., 33., 56., 33., 58., 33., 60., 33., 62., 33., 64., 33.,
17717        );
17718        assert_eq_m512h(r, e);
17719    }
17720
17721    #[simd_test(enable = "avx512fp16")]
17722    unsafe fn test_mm512_maskz_add_round_ph() {
17723        let a = _mm512_set_ph(
17724            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17725            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17726            31.0, 32.0,
17727        );
17728        let b = _mm512_set_ph(
17729            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17730            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17731            3.0, 2.0, 1.0,
17732        );
17733        let r = _mm512_maskz_add_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17734            0b01010101010101010101010101010101,
17735            a,
17736            b,
17737        );
17738        let e = _mm512_set_ph(
17739            0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0.,
17740            33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33., 0., 33.,
17741        );
17742        assert_eq_m512h(r, e);
17743    }
17744
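    // The scalar _sh intrinsics operate on element 0 only and copy the remaining
    // elements of the result from `a`; those upper elements are all zero here because
    // the operands are built with _mm_set_sh.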
17745    #[simd_test(enable = "avx512fp16,avx512vl")]
17746    unsafe fn test_mm_add_round_sh() {
17747        let a = _mm_set_sh(1.0);
17748        let b = _mm_set_sh(2.0);
17749        let r = _mm_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17750        let e = _mm_set_sh(3.0);
17751        assert_eq_m128h(r, e);
17752    }
17753
17754    #[simd_test(enable = "avx512fp16,avx512vl")]
17755    unsafe fn test_mm_mask_add_round_sh() {
17756        let a = _mm_set_sh(1.0);
17757        let b = _mm_set_sh(2.0);
17758        let src = _mm_set_sh(4.0);
17759        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17760            src, 0, a, b,
17761        );
17762        let e = _mm_set_sh(4.0);
17763        assert_eq_m128h(r, e);
17764        let r = _mm_mask_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17765            src, 1, a, b,
17766        );
17767        let e = _mm_set_sh(3.0);
17768        assert_eq_m128h(r, e);
17769    }
17770
17771    #[simd_test(enable = "avx512fp16,avx512vl")]
17772    unsafe fn test_mm_maskz_add_round_sh() {
17773        let a = _mm_set_sh(1.0);
17774        let b = _mm_set_sh(2.0);
17775        let r =
17776            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
17777        let e = _mm_set_sh(0.0);
17778        assert_eq_m128h(r, e);
17779        let r =
17780            _mm_maskz_add_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
17781        let e = _mm_set_sh(3.0);
17782        assert_eq_m128h(r, e);
17783    }
17784
17785    #[simd_test(enable = "avx512fp16,avx512vl")]
17786    unsafe fn test_mm_add_sh() {
17787        let a = _mm_set_sh(1.0);
17788        let b = _mm_set_sh(2.0);
17789        let r = _mm_add_sh(a, b);
17790        let e = _mm_set_sh(3.0);
17791        assert_eq_m128h(r, e);
17792    }
17793
17794    #[simd_test(enable = "avx512fp16,avx512vl")]
17795    unsafe fn test_mm_mask_add_sh() {
17796        let a = _mm_set_sh(1.0);
17797        let b = _mm_set_sh(2.0);
17798        let src = _mm_set_sh(4.0);
17799        let r = _mm_mask_add_sh(src, 0, a, b);
17800        let e = _mm_set_sh(4.0);
17801        assert_eq_m128h(r, e);
17802        let r = _mm_mask_add_sh(src, 1, a, b);
17803        let e = _mm_set_sh(3.0);
17804        assert_eq_m128h(r, e);
17805    }
17806
17807    #[simd_test(enable = "avx512fp16,avx512vl")]
17808    unsafe fn test_mm_maskz_add_sh() {
17809        let a = _mm_set_sh(1.0);
17810        let b = _mm_set_sh(2.0);
17811        let r = _mm_maskz_add_sh(0, a, b);
17812        let e = _mm_set_sh(0.0);
17813        assert_eq_m128h(r, e);
17814        let r = _mm_maskz_add_sh(1, a, b);
17815        let e = _mm_set_sh(3.0);
17816        assert_eq_m128h(r, e);
17817    }
17818
17819    #[simd_test(enable = "avx512fp16,avx512vl")]
17820    unsafe fn test_mm_sub_ph() {
17821        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17822        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17823        let r = _mm_sub_ph(a, b);
17824        let e = _mm_set_ph(-7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0);
17825        assert_eq_m128h(r, e);
17826    }
17827
17828    #[simd_test(enable = "avx512fp16,avx512vl")]
17829    unsafe fn test_mm_mask_sub_ph() {
17830        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17831        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17832        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
17833        let r = _mm_mask_sub_ph(src, 0b01010101, a, b);
17834        let e = _mm_set_ph(10., -5., 12., -1., 14., 3., 16., 7.);
17835        assert_eq_m128h(r, e);
17836    }
17837
17838    #[simd_test(enable = "avx512fp16,avx512vl")]
17839    unsafe fn test_mm_maskz_sub_ph() {
17840        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
17841        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
17842        let r = _mm_maskz_sub_ph(0b01010101, a, b);
17843        let e = _mm_set_ph(0., -5., 0., -1., 0., 3., 0., 7.);
17844        assert_eq_m128h(r, e);
17845    }
17846
17847    #[simd_test(enable = "avx512fp16,avx512vl")]
17848    unsafe fn test_mm256_sub_ph() {
17849        let a = _mm256_set_ph(
17850            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17851        );
17852        let b = _mm256_set_ph(
17853            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17854        );
17855        let r = _mm256_sub_ph(a, b);
17856        let e = _mm256_set_ph(
17857            -15.0, -13.0, -11.0, -9.0, -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0,
17858            15.0,
17859        );
17860        assert_eq_m256h(r, e);
17861    }
17862
17863    #[simd_test(enable = "avx512fp16,avx512vl")]
17864    unsafe fn test_mm256_mask_sub_ph() {
17865        let a = _mm256_set_ph(
17866            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17867        );
17868        let b = _mm256_set_ph(
17869            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17870        );
17871        let src = _mm256_set_ph(
17872            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
17873        );
17874        let r = _mm256_mask_sub_ph(src, 0b0101010101010101, a, b);
17875        let e = _mm256_set_ph(
17876            18., -13., 20., -9., 22., -5., 24., -1., 26., 3., 28., 7., 30., 11., 32., 15.,
17877        );
17878        assert_eq_m256h(r, e);
17879    }
17880
17881    #[simd_test(enable = "avx512fp16,avx512vl")]
17882    unsafe fn test_mm256_maskz_sub_ph() {
17883        let a = _mm256_set_ph(
17884            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17885        );
17886        let b = _mm256_set_ph(
17887            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
17888        );
17889        let r = _mm256_maskz_sub_ph(0b0101010101010101, a, b);
17890        let e = _mm256_set_ph(
17891            0., -13., 0., -9., 0., -5., 0., -1., 0., 3., 0., 7., 0., 11., 0., 15.,
17892        );
17893        assert_eq_m256h(r, e);
17894    }
17895
17896    #[simd_test(enable = "avx512fp16")]
17897    unsafe fn test_mm512_sub_ph() {
17898        let a = _mm512_set_ph(
17899            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17900            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17901            31.0, 32.0,
17902        );
17903        let b = _mm512_set_ph(
17904            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17905            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17906            3.0, 2.0, 1.0,
17907        );
17908        let r = _mm512_sub_ph(a, b);
17909        let e = _mm512_set_ph(
17910            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17911            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17912            23.0, 25.0, 27.0, 29.0, 31.0,
17913        );
17914        assert_eq_m512h(r, e);
17915    }
17916
17917    #[simd_test(enable = "avx512fp16")]
17918    unsafe fn test_mm512_mask_sub_ph() {
17919        let a = _mm512_set_ph(
17920            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17921            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17922            31.0, 32.0,
17923        );
17924        let b = _mm512_set_ph(
17925            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17926            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17927            3.0, 2.0, 1.0,
17928        );
17929        let src = _mm512_set_ph(
17930            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17931            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17932        );
17933        let r = _mm512_mask_sub_ph(src, 0b01010101010101010101010101010101, a, b);
17934        let e = _mm512_set_ph(
17935            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
17936            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
17937        );
17938        assert_eq_m512h(r, e);
17939    }
17940
17941    #[simd_test(enable = "avx512fp16")]
17942    unsafe fn test_mm512_maskz_sub_ph() {
17943        let a = _mm512_set_ph(
17944            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17945            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17946            31.0, 32.0,
17947        );
17948        let b = _mm512_set_ph(
17949            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17950            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17951            3.0, 2.0, 1.0,
17952        );
17953        let r = _mm512_maskz_sub_ph(0b01010101010101010101010101010101, a, b);
17954        let e = _mm512_set_ph(
17955            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
17956            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
17957        );
17958        assert_eq_m512h(r, e);
17959    }
17960
17961    #[simd_test(enable = "avx512fp16")]
17962    unsafe fn test_mm512_sub_round_ph() {
17963        let a = _mm512_set_ph(
17964            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17965            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17966            31.0, 32.0,
17967        );
17968        let b = _mm512_set_ph(
17969            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17970            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17971            3.0, 2.0, 1.0,
17972        );
17973        let r = _mm512_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
17974        let e = _mm512_set_ph(
17975            -31.0, -29.0, -27.0, -25.0, -23.0, -21.0, -19.0, -17.0, -15.0, -13.0, -11.0, -9.0,
17976            -7.0, -5.0, -3.0, -1.0, 1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0,
17977            23.0, 25.0, 27.0, 29.0, 31.0,
17978        );
17979        assert_eq_m512h(r, e);
17980    }
17981
17982    #[simd_test(enable = "avx512fp16")]
17983    unsafe fn test_mm512_mask_sub_round_ph() {
17984        let a = _mm512_set_ph(
17985            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
17986            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
17987            31.0, 32.0,
17988        );
17989        let b = _mm512_set_ph(
17990            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
17991            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
17992            3.0, 2.0, 1.0,
17993        );
17994        let src = _mm512_set_ph(
17995            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
17996            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
17997        );
17998        let r = _mm512_mask_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
17999            src,
18000            0b01010101010101010101010101010101,
18001            a,
18002            b,
18003        );
18004        let e = _mm512_set_ph(
18005            34., -29., 36., -25., 38., -21., 40., -17., 42., -13., 44., -9., 46., -5., 48., -1.,
18006            50., 3., 52., 7., 54., 11., 56., 15., 58., 19., 60., 23., 62., 27., 64., 31.,
18007        );
18008        assert_eq_m512h(r, e);
18009    }
18010
18011    #[simd_test(enable = "avx512fp16")]
18012    unsafe fn test_mm512_maskz_sub_round_ph() {
18013        let a = _mm512_set_ph(
18014            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18015            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18016            31.0, 32.0,
18017        );
18018        let b = _mm512_set_ph(
18019            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18020            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18021            3.0, 2.0, 1.0,
18022        );
18023        let r = _mm512_maskz_sub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18024            0b01010101010101010101010101010101,
18025            a,
18026            b,
18027        );
18028        let e = _mm512_set_ph(
18029            0., -29., 0., -25., 0., -21., 0., -17., 0., -13., 0., -9., 0., -5., 0., -1., 0., 3.,
18030            0., 7., 0., 11., 0., 15., 0., 19., 0., 23., 0., 27., 0., 31.,
18031        );
18032        assert_eq_m512h(r, e);
18033    }
18034
18035    #[simd_test(enable = "avx512fp16,avx512vl")]
18036    unsafe fn test_mm_sub_round_sh() {
18037        let a = _mm_set_sh(1.0);
18038        let b = _mm_set_sh(2.0);
18039        let r = _mm_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18040        let e = _mm_set_sh(-1.0);
18041        assert_eq_m128h(r, e);
18042    }
18043
18044    #[simd_test(enable = "avx512fp16,avx512vl")]
18045    unsafe fn test_mm_mask_sub_round_sh() {
18046        let a = _mm_set_sh(1.0);
18047        let b = _mm_set_sh(2.0);
18048        let src = _mm_set_sh(4.0);
18049        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18050            src, 0, a, b,
18051        );
18052        let e = _mm_set_sh(4.0);
18053        assert_eq_m128h(r, e);
18054        let r = _mm_mask_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18055            src, 1, a, b,
18056        );
18057        let e = _mm_set_sh(-1.0);
18058        assert_eq_m128h(r, e);
18059    }
18060
18061    #[simd_test(enable = "avx512fp16,avx512vl")]
18062    unsafe fn test_mm_maskz_sub_round_sh() {
18063        let a = _mm_set_sh(1.0);
18064        let b = _mm_set_sh(2.0);
18065        let r =
18066            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18067        let e = _mm_set_sh(0.0);
18068        assert_eq_m128h(r, e);
18069        let r =
18070            _mm_maskz_sub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18071        let e = _mm_set_sh(-1.0);
18072        assert_eq_m128h(r, e);
18073    }
18074
18075    #[simd_test(enable = "avx512fp16,avx512vl")]
18076    unsafe fn test_mm_sub_sh() {
18077        let a = _mm_set_sh(1.0);
18078        let b = _mm_set_sh(2.0);
18079        let r = _mm_sub_sh(a, b);
18080        let e = _mm_set_sh(-1.0);
18081        assert_eq_m128h(r, e);
18082    }
18083
18084    #[simd_test(enable = "avx512fp16,avx512vl")]
18085    unsafe fn test_mm_mask_sub_sh() {
18086        let a = _mm_set_sh(1.0);
18087        let b = _mm_set_sh(2.0);
18088        let src = _mm_set_sh(4.0);
18089        let r = _mm_mask_sub_sh(src, 0, a, b);
18090        let e = _mm_set_sh(4.0);
18091        assert_eq_m128h(r, e);
18092        let r = _mm_mask_sub_sh(src, 1, a, b);
18093        let e = _mm_set_sh(-1.0);
18094        assert_eq_m128h(r, e);
18095    }
18096
18097    #[simd_test(enable = "avx512fp16,avx512vl")]
18098    unsafe fn test_mm_maskz_sub_sh() {
18099        let a = _mm_set_sh(1.0);
18100        let b = _mm_set_sh(2.0);
18101        let r = _mm_maskz_sub_sh(0, a, b);
18102        let e = _mm_set_sh(0.0);
18103        assert_eq_m128h(r, e);
18104        let r = _mm_maskz_sub_sh(1, a, b);
18105        let e = _mm_set_sh(-1.0);
18106        assert_eq_m128h(r, e);
18107    }
18108
18109    #[simd_test(enable = "avx512fp16,avx512vl")]
18110    unsafe fn test_mm_mul_ph() {
18111        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18112        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18113        let r = _mm_mul_ph(a, b);
18114        let e = _mm_set_ph(8.0, 14.0, 18.0, 20.0, 20.0, 18.0, 14.0, 8.0);
18115        assert_eq_m128h(r, e);
18116    }
18117
18118    #[simd_test(enable = "avx512fp16,avx512vl")]
18119    unsafe fn test_mm_mask_mul_ph() {
18120        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18121        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18122        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
18123        let r = _mm_mask_mul_ph(src, 0b01010101, a, b);
18124        let e = _mm_set_ph(10., 14., 12., 20., 14., 18., 16., 8.);
18125        assert_eq_m128h(r, e);
18126    }
18127
18128    #[simd_test(enable = "avx512fp16,avx512vl")]
18129    unsafe fn test_mm_maskz_mul_ph() {
18130        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
18131        let b = _mm_set_ph(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0);
18132        let r = _mm_maskz_mul_ph(0b01010101, a, b);
18133        let e = _mm_set_ph(0., 14., 0., 20., 0., 18., 0., 8.);
18134        assert_eq_m128h(r, e);
18135    }
18136
18137    #[simd_test(enable = "avx512fp16,avx512vl")]
18138    unsafe fn test_mm256_mul_ph() {
18139        let a = _mm256_set_ph(
18140            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18141        );
18142        let b = _mm256_set_ph(
18143            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18144        );
18145        let r = _mm256_mul_ph(a, b);
18146        let e = _mm256_set_ph(
18147            16.0, 30.0, 42.0, 52.0, 60.0, 66.0, 70.0, 72.0, 72.0, 70.0, 66.0, 60.0, 52.0, 42.0,
18148            30.0, 16.0,
18149        );
18150        assert_eq_m256h(r, e);
18151    }
18152
18153    #[simd_test(enable = "avx512fp16,avx512vl")]
18154    unsafe fn test_mm256_mask_mul_ph() {
18155        let a = _mm256_set_ph(
18156            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18157        );
18158        let b = _mm256_set_ph(
18159            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18160        );
18161        let src = _mm256_set_ph(
18162            18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32., 33.,
18163        );
18164        let r = _mm256_mask_mul_ph(src, 0b0101010101010101, a, b);
18165        let e = _mm256_set_ph(
18166            18., 30., 20., 52., 22., 66., 24., 72., 26., 70., 28., 60., 30., 42., 32., 16.,
18167        );
18168        assert_eq_m256h(r, e);
18169    }
18170
18171    #[simd_test(enable = "avx512fp16,avx512vl")]
18172    unsafe fn test_mm256_maskz_mul_ph() {
18173        let a = _mm256_set_ph(
18174            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18175        );
18176        let b = _mm256_set_ph(
18177            16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0,
18178        );
18179        let r = _mm256_maskz_mul_ph(0b0101010101010101, a, b);
18180        let e = _mm256_set_ph(
18181            0., 30., 0., 52., 0., 66., 0., 72., 0., 70., 0., 60., 0., 42., 0., 16.,
18182        );
18183        assert_eq_m256h(r, e);
18184    }
18185
18186    #[simd_test(enable = "avx512fp16")]
18187    unsafe fn test_mm512_mul_ph() {
18188        let a = _mm512_set_ph(
18189            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18190            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18191            31.0, 32.0,
18192        );
18193        let b = _mm512_set_ph(
18194            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18195            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18196            3.0, 2.0, 1.0,
18197        );
18198        let r = _mm512_mul_ph(a, b);
18199        let e = _mm512_set_ph(
18200            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18201            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18202            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18203        );
18204        assert_eq_m512h(r, e);
18205    }
18206
18207    #[simd_test(enable = "avx512fp16")]
18208    unsafe fn test_mm512_mask_mul_ph() {
18209        let a = _mm512_set_ph(
18210            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18211            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18212            31.0, 32.0,
18213        );
18214        let b = _mm512_set_ph(
18215            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18216            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18217            3.0, 2.0, 1.0,
18218        );
18219        let src = _mm512_set_ph(
18220            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18221            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18222        );
18223        let r = _mm512_mask_mul_ph(src, 0b01010101010101010101010101010101, a, b);
18224        let e = _mm512_set_ph(
18225            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18226            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18227        );
18228        assert_eq_m512h(r, e);
18229    }
18230
18231    #[simd_test(enable = "avx512fp16")]
18232    unsafe fn test_mm512_maskz_mul_ph() {
18233        let a = _mm512_set_ph(
18234            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18235            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18236            31.0, 32.0,
18237        );
18238        let b = _mm512_set_ph(
18239            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18240            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18241            3.0, 2.0, 1.0,
18242        );
18243        let r = _mm512_maskz_mul_ph(0b01010101010101010101010101010101, a, b);
18244        let e = _mm512_set_ph(
18245            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18246            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18247        );
18248        assert_eq_m512h(r, e);
18249    }
18250
18251    #[simd_test(enable = "avx512fp16")]
18252    unsafe fn test_mm512_mul_round_ph() {
18253        let a = _mm512_set_ph(
18254            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18255            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18256            31.0, 32.0,
18257        );
18258        let b = _mm512_set_ph(
18259            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18260            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18261            3.0, 2.0, 1.0,
18262        );
18263        let r = _mm512_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18264        let e = _mm512_set_ph(
18265            32.0, 62.0, 90.0, 116.0, 140.0, 162.0, 182.0, 200.0, 216.0, 230.0, 242.0, 252.0, 260.0,
18266            266.0, 270.0, 272.0, 272.0, 270.0, 266.0, 260.0, 252.0, 242.0, 230.0, 216.0, 200.0,
18267            182.0, 162.0, 140.0, 116.0, 90.0, 62.0, 32.0,
18268        );
18269        assert_eq_m512h(r, e);
18270    }
18271
18272    #[simd_test(enable = "avx512fp16")]
18273    unsafe fn test_mm512_mask_mul_round_ph() {
18274        let a = _mm512_set_ph(
18275            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18276            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18277            31.0, 32.0,
18278        );
18279        let b = _mm512_set_ph(
18280            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18281            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18282            3.0, 2.0, 1.0,
18283        );
18284        let src = _mm512_set_ph(
18285            34., 35., 36., 37., 38., 39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50.,
18286            51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63., 64., 65.,
18287        );
18288        let r = _mm512_mask_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18289            src,
18290            0b01010101010101010101010101010101,
18291            a,
18292            b,
18293        );
18294        let e = _mm512_set_ph(
18295            34., 62., 36., 116., 38., 162., 40., 200., 42., 230., 44., 252., 46., 266., 48., 272.,
18296            50., 270., 52., 260., 54., 242., 56., 216., 58., 182., 60., 140., 62., 90., 64., 32.,
18297        );
18298        assert_eq_m512h(r, e);
18299    }
18300
18301    #[simd_test(enable = "avx512fp16")]
18302    unsafe fn test_mm512_maskz_mul_round_ph() {
18303        let a = _mm512_set_ph(
18304            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
18305            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
18306            31.0, 32.0,
18307        );
18308        let b = _mm512_set_ph(
18309            32.0, 31.0, 30.0, 29.0, 28.0, 27.0, 26.0, 25.0, 24.0, 23.0, 22.0, 21.0, 20.0, 19.0,
18310            18.0, 17.0, 16.0, 15.0, 14.0, 13.0, 12.0, 11.0, 10.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0,
18311            3.0, 2.0, 1.0,
18312        );
18313        let r = _mm512_maskz_mul_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18314            0b01010101010101010101010101010101,
18315            a,
18316            b,
18317        );
18318        let e = _mm512_set_ph(
18319            0., 62., 0., 116., 0., 162., 0., 200., 0., 230., 0., 252., 0., 266., 0., 272., 0.,
18320            270., 0., 260., 0., 242., 0., 216., 0., 182., 0., 140., 0., 90., 0., 32.,
18321        );
18322        assert_eq_m512h(r, e);
18323    }
18324
18325    #[simd_test(enable = "avx512fp16,avx512vl")]
18326    unsafe fn test_mm_mul_round_sh() {
18327        let a = _mm_set_sh(1.0);
18328        let b = _mm_set_sh(2.0);
18329        let r = _mm_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18330        let e = _mm_set_sh(2.0);
18331        assert_eq_m128h(r, e);
18332    }
18333
18334    #[simd_test(enable = "avx512fp16,avx512vl")]
18335    unsafe fn test_mm_mask_mul_round_sh() {
18336        let a = _mm_set_sh(1.0);
18337        let b = _mm_set_sh(2.0);
18338        let src = _mm_set_sh(4.0);
18339        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18340            src, 0, a, b,
18341        );
18342        let e = _mm_set_sh(4.0);
18343        assert_eq_m128h(r, e);
18344        let r = _mm_mask_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18345            src, 1, a, b,
18346        );
18347        let e = _mm_set_sh(2.0);
18348        assert_eq_m128h(r, e);
18349    }
18350
18351    #[simd_test(enable = "avx512fp16,avx512vl")]
18352    unsafe fn test_mm_maskz_mul_round_sh() {
18353        let a = _mm_set_sh(1.0);
18354        let b = _mm_set_sh(2.0);
18355        let r =
18356            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18357        let e = _mm_set_sh(0.0);
18358        assert_eq_m128h(r, e);
18359        let r =
18360            _mm_maskz_mul_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18361        let e = _mm_set_sh(2.0);
18362        assert_eq_m128h(r, e);
18363    }
18364
18365    #[simd_test(enable = "avx512fp16,avx512vl")]
18366    unsafe fn test_mm_mul_sh() {
18367        let a = _mm_set_sh(1.0);
18368        let b = _mm_set_sh(2.0);
18369        let r = _mm_mul_sh(a, b);
18370        let e = _mm_set_sh(2.0);
18371        assert_eq_m128h(r, e);
18372    }
18373
18374    #[simd_test(enable = "avx512fp16,avx512vl")]
18375    unsafe fn test_mm_mask_mul_sh() {
18376        let a = _mm_set_sh(1.0);
18377        let b = _mm_set_sh(2.0);
18378        let src = _mm_set_sh(4.0);
18379        let r = _mm_mask_mul_sh(src, 0, a, b);
18380        let e = _mm_set_sh(4.0);
18381        assert_eq_m128h(r, e);
18382        let r = _mm_mask_mul_sh(src, 1, a, b);
18383        let e = _mm_set_sh(2.0);
18384        assert_eq_m128h(r, e);
18385    }
18386
18387    #[simd_test(enable = "avx512fp16,avx512vl")]
18388    unsafe fn test_mm_maskz_mul_sh() {
18389        let a = _mm_set_sh(1.0);
18390        let b = _mm_set_sh(2.0);
18391        let r = _mm_maskz_mul_sh(0, a, b);
18392        let e = _mm_set_sh(0.0);
18393        assert_eq_m128h(r, e);
18394        let r = _mm_maskz_mul_sh(1, a, b);
18395        let e = _mm_set_sh(2.0);
18396        assert_eq_m128h(r, e);
18397    }
18398
18399    #[simd_test(enable = "avx512fp16,avx512vl")]
18400    unsafe fn test_mm_div_ph() {
18401        let a = _mm_set1_ph(1.0);
18402        let b = _mm_set1_ph(2.0);
18403        let r = _mm_div_ph(a, b);
18404        let e = _mm_set1_ph(0.5);
18405        assert_eq_m128h(r, e);
18406    }
18407
18408    #[simd_test(enable = "avx512fp16,avx512vl")]
18409    unsafe fn test_mm_mask_div_ph() {
18410        let a = _mm_set1_ph(1.0);
18411        let b = _mm_set1_ph(2.0);
18412        let src = _mm_set_ph(4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
18413        let r = _mm_mask_div_ph(src, 0b01010101, a, b);
18414        let e = _mm_set_ph(4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5);
18415        assert_eq_m128h(r, e);
18416    }
18417
18418    #[simd_test(enable = "avx512fp16,avx512vl")]
18419    unsafe fn test_mm_maskz_div_ph() {
18420        let a = _mm_set1_ph(1.0);
18421        let b = _mm_set1_ph(2.0);
18422        let r = _mm_maskz_div_ph(0b01010101, a, b);
18423        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
18424        assert_eq_m128h(r, e);
18425    }
18426
18427    #[simd_test(enable = "avx512fp16,avx512vl")]
18428    unsafe fn test_mm256_div_ph() {
18429        let a = _mm256_set1_ph(1.0);
18430        let b = _mm256_set1_ph(2.0);
18431        let r = _mm256_div_ph(a, b);
18432        let e = _mm256_set1_ph(0.5);
18433        assert_eq_m256h(r, e);
18434    }
18435
18436    #[simd_test(enable = "avx512fp16,avx512vl")]
18437    unsafe fn test_mm256_mask_div_ph() {
18438        let a = _mm256_set1_ph(1.0);
18439        let b = _mm256_set1_ph(2.0);
18440        let src = _mm256_set_ph(
18441            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18442            19.0,
18443        );
18444        let r = _mm256_mask_div_ph(src, 0b0101010101010101, a, b);
18445        let e = _mm256_set_ph(
18446            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18447        );
18448        assert_eq_m256h(r, e);
18449    }
18450
18451    #[simd_test(enable = "avx512fp16,avx512vl")]
18452    unsafe fn test_mm256_maskz_div_ph() {
18453        let a = _mm256_set1_ph(1.0);
18454        let b = _mm256_set1_ph(2.0);
18455        let r = _mm256_maskz_div_ph(0b0101010101010101, a, b);
18456        let e = _mm256_set_ph(
18457            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18458        );
18459        assert_eq_m256h(r, e);
18460    }
18461
18462    #[simd_test(enable = "avx512fp16")]
18463    unsafe fn test_mm512_div_ph() {
18464        let a = _mm512_set1_ph(1.0);
18465        let b = _mm512_set1_ph(2.0);
18466        let r = _mm512_div_ph(a, b);
18467        let e = _mm512_set1_ph(0.5);
18468        assert_eq_m512h(r, e);
18469    }
18470
18471    #[simd_test(enable = "avx512fp16")]
18472    unsafe fn test_mm512_mask_div_ph() {
18473        let a = _mm512_set1_ph(1.0);
18474        let b = _mm512_set1_ph(2.0);
18475        let src = _mm512_set_ph(
18476            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18477            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18478            33.0, 34.0, 35.0,
18479        );
18480        let r = _mm512_mask_div_ph(src, 0b01010101010101010101010101010101, a, b);
18481        let e = _mm512_set_ph(
18482            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18483            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18484        );
18485        assert_eq_m512h(r, e);
18486    }
18487
18488    #[simd_test(enable = "avx512fp16")]
18489    unsafe fn test_mm512_maskz_div_ph() {
18490        let a = _mm512_set1_ph(1.0);
18491        let b = _mm512_set1_ph(2.0);
18492        let r = _mm512_maskz_div_ph(0b01010101010101010101010101010101, a, b);
18493        let e = _mm512_set_ph(
18494            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18495            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18496        );
18497        assert_eq_m512h(r, e);
18498    }
18499
18500    #[simd_test(enable = "avx512fp16")]
18501    unsafe fn test_mm512_div_round_ph() {
18502        let a = _mm512_set1_ph(1.0);
18503        let b = _mm512_set1_ph(2.0);
18504        let r = _mm512_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18505        let e = _mm512_set1_ph(0.5);
18506        assert_eq_m512h(r, e);
18507    }
18508
18509    #[simd_test(enable = "avx512fp16")]
18510    unsafe fn test_mm512_mask_div_round_ph() {
18511        let a = _mm512_set1_ph(1.0);
18512        let b = _mm512_set1_ph(2.0);
18513        let src = _mm512_set_ph(
18514            4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0,
18515            19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0,
18516            33.0, 34.0, 35.0,
18517        );
18518        let r = _mm512_mask_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18519            src,
18520            0b01010101010101010101010101010101,
18521            a,
18522            b,
18523        );
18524        let e = _mm512_set_ph(
18525            4.0, 0.5, 6.0, 0.5, 8.0, 0.5, 10.0, 0.5, 12.0, 0.5, 14.0, 0.5, 16.0, 0.5, 18.0, 0.5,
18526            20.0, 0.5, 22.0, 0.5, 24.0, 0.5, 26.0, 0.5, 28.0, 0.5, 30.0, 0.5, 32.0, 0.5, 34.0, 0.5,
18527        );
18528        assert_eq_m512h(r, e);
18529    }
18530
18531    #[simd_test(enable = "avx512fp16")]
18532    unsafe fn test_mm512_maskz_div_round_ph() {
18533        let a = _mm512_set1_ph(1.0);
18534        let b = _mm512_set1_ph(2.0);
18535        let r = _mm512_maskz_div_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18536            0b01010101010101010101010101010101,
18537            a,
18538            b,
18539        );
18540        let e = _mm512_set_ph(
18541            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
18542            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
18543        );
18544        assert_eq_m512h(r, e);
18545    }
18546
18547    #[simd_test(enable = "avx512fp16,avx512vl")]
18548    unsafe fn test_mm_div_round_sh() {
18549        let a = _mm_set_sh(1.0);
18550        let b = _mm_set_sh(2.0);
18551        let r = _mm_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18552        let e = _mm_set_sh(0.5);
18553        assert_eq_m128h(r, e);
18554    }
18555
18556    #[simd_test(enable = "avx512fp16,avx512vl")]
18557    unsafe fn test_mm_mask_div_round_sh() {
18558        let a = _mm_set_sh(1.0);
18559        let b = _mm_set_sh(2.0);
18560        let src = _mm_set_sh(4.0);
18561        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18562            src, 0, a, b,
18563        );
18564        let e = _mm_set_sh(4.0);
18565        assert_eq_m128h(r, e);
18566        let r = _mm_mask_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18567            src, 1, a, b,
18568        );
18569        let e = _mm_set_sh(0.5);
18570        assert_eq_m128h(r, e);
18571    }
18572
18573    #[simd_test(enable = "avx512fp16,avx512vl")]
18574    unsafe fn test_mm_maskz_div_round_sh() {
18575        let a = _mm_set_sh(1.0);
18576        let b = _mm_set_sh(2.0);
18577        let r =
18578            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18579        let e = _mm_set_sh(0.0);
18580        assert_eq_m128h(r, e);
18581        let r =
18582            _mm_maskz_div_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
18583        let e = _mm_set_sh(0.5);
18584        assert_eq_m128h(r, e);
18585    }
18586
18587    #[simd_test(enable = "avx512fp16,avx512vl")]
18588    unsafe fn test_mm_div_sh() {
18589        let a = _mm_set_sh(1.0);
18590        let b = _mm_set_sh(2.0);
18591        let r = _mm_div_sh(a, b);
18592        let e = _mm_set_sh(0.5);
18593        assert_eq_m128h(r, e);
18594    }
18595
18596    #[simd_test(enable = "avx512fp16,avx512vl")]
18597    unsafe fn test_mm_mask_div_sh() {
18598        let a = _mm_set_sh(1.0);
18599        let b = _mm_set_sh(2.0);
18600        let src = _mm_set_sh(4.0);
18601        let r = _mm_mask_div_sh(src, 0, a, b);
18602        let e = _mm_set_sh(4.0);
18603        assert_eq_m128h(r, e);
18604        let r = _mm_mask_div_sh(src, 1, a, b);
18605        let e = _mm_set_sh(0.5);
18606        assert_eq_m128h(r, e);
18607    }
18608
18609    #[simd_test(enable = "avx512fp16,avx512vl")]
18610    unsafe fn test_mm_maskz_div_sh() {
18611        let a = _mm_set_sh(1.0);
18612        let b = _mm_set_sh(2.0);
18613        let r = _mm_maskz_div_sh(0, a, b);
18614        let e = _mm_set_sh(0.0);
18615        assert_eq_m128h(r, e);
18616        let r = _mm_maskz_div_sh(1, a, b);
18617        let e = _mm_set_sh(0.5);
18618        assert_eq_m128h(r, e);
18619    }
18620
18621    #[simd_test(enable = "avx512fp16,avx512vl")]
18622    unsafe fn test_mm_mul_pch() {
18623        let a = _mm_set1_pch(0.0, 1.0);
18624        let b = _mm_set1_pch(0.0, 1.0);
18625        let r = _mm_mul_pch(a, b);
18626        let e = _mm_set1_pch(-1.0, 0.0);
18627        assert_eq_m128h(r, e);
18628    }
18629
18630    #[simd_test(enable = "avx512fp16,avx512vl")]
18631    unsafe fn test_mm_mask_mul_pch() {
18632        let a = _mm_set1_pch(0.0, 1.0);
18633        let b = _mm_set1_pch(0.0, 1.0);
18634        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18635        let r = _mm_mask_mul_pch(src, 0b0101, a, b);
18636        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18637        assert_eq_m128h(r, e);
18638    }
18639
18640    #[simd_test(enable = "avx512fp16,avx512vl")]
18641    unsafe fn test_mm_maskz_mul_pch() {
18642        let a = _mm_set1_pch(0.0, 1.0);
18643        let b = _mm_set1_pch(0.0, 1.0);
18644        let r = _mm_maskz_mul_pch(0b0101, a, b);
18645        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18646        assert_eq_m128h(r, e);
18647    }
18648
18649    #[simd_test(enable = "avx512fp16,avx512vl")]
18650    unsafe fn test_mm256_mul_pch() {
18651        let a = _mm256_set1_pch(0.0, 1.0);
18652        let b = _mm256_set1_pch(0.0, 1.0);
18653        let r = _mm256_mul_pch(a, b);
18654        let e = _mm256_set1_pch(-1.0, 0.0);
18655        assert_eq_m256h(r, e);
18656    }
18657
18658    #[simd_test(enable = "avx512fp16,avx512vl")]
18659    unsafe fn test_mm256_mask_mul_pch() {
18660        let a = _mm256_set1_pch(0.0, 1.0);
18661        let b = _mm256_set1_pch(0.0, 1.0);
18662        let src = _mm256_setr_ph(
18663            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18664        );
18665        let r = _mm256_mask_mul_pch(src, 0b01010101, a, b);
18666        let e = _mm256_setr_ph(
18667            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18668        );
18669        assert_eq_m256h(r, e);
18670    }
18671
18672    #[simd_test(enable = "avx512fp16,avx512vl")]
18673    unsafe fn test_mm256_maskz_mul_pch() {
18674        let a = _mm256_set1_pch(0.0, 1.0);
18675        let b = _mm256_set1_pch(0.0, 1.0);
18676        let r = _mm256_maskz_mul_pch(0b01010101, a, b);
18677        let e = _mm256_setr_ph(
18678            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18679        );
18680        assert_eq_m256h(r, e);
18681    }
18682
18683    #[simd_test(enable = "avx512fp16")]
18684    unsafe fn test_mm512_mul_pch() {
18685        let a = _mm512_set1_pch(0.0, 1.0);
18686        let b = _mm512_set1_pch(0.0, 1.0);
18687        let r = _mm512_mul_pch(a, b);
18688        let e = _mm512_set1_pch(-1.0, 0.0);
18689        assert_eq_m512h(r, e);
18690    }
18691
18692    #[simd_test(enable = "avx512fp16")]
18693    unsafe fn test_mm512_mask_mul_pch() {
18694        let a = _mm512_set1_pch(0.0, 1.0);
18695        let b = _mm512_set1_pch(0.0, 1.0);
18696        let src = _mm512_setr_ph(
18697            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18698            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18699            32.0, 33.0,
18700        );
18701        let r = _mm512_mask_mul_pch(src, 0b0101010101010101, a, b);
18702        let e = _mm512_setr_ph(
18703            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18704            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18705            33.0,
18706        );
18707        assert_eq_m512h(r, e);
18708    }
18709
18710    #[simd_test(enable = "avx512fp16")]
18711    unsafe fn test_mm512_maskz_mul_pch() {
18712        let a = _mm512_set1_pch(0.0, 1.0);
18713        let b = _mm512_set1_pch(0.0, 1.0);
18714        let r = _mm512_maskz_mul_pch(0b0101010101010101, a, b);
18715        let e = _mm512_setr_ph(
18716            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18717            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18718        );
18719        assert_eq_m512h(r, e);
18720    }
18721
18722    #[simd_test(enable = "avx512fp16")]
18723    unsafe fn test_mm512_mul_round_pch() {
18724        let a = _mm512_set1_pch(0.0, 1.0);
18725        let b = _mm512_set1_pch(0.0, 1.0);
18726        let r = _mm512_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18727        let e = _mm512_set1_pch(-1.0, 0.0);
18728        assert_eq_m512h(r, e);
18729    }
18730
18731    #[simd_test(enable = "avx512fp16")]
18732    unsafe fn test_mm512_mask_mul_round_pch() {
18733        let a = _mm512_set1_pch(0.0, 1.0);
18734        let b = _mm512_set1_pch(0.0, 1.0);
18735        let src = _mm512_setr_ph(
18736            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18737            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18738            32.0, 33.0,
18739        );
18740        let r = _mm512_mask_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18741            src,
18742            0b0101010101010101,
18743            a,
18744            b,
18745        );
18746        let e = _mm512_setr_ph(
18747            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18748            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18749            33.0,
18750        );
18751        assert_eq_m512h(r, e);
18752    }
18753
18754    #[simd_test(enable = "avx512fp16")]
18755    unsafe fn test_mm512_maskz_mul_round_pch() {
18756        let a = _mm512_set1_pch(0.0, 1.0);
18757        let b = _mm512_set1_pch(0.0, 1.0);
18758        let r = _mm512_maskz_mul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18759            0b0101010101010101,
18760            a,
18761            b,
18762        );
18763        let e = _mm512_setr_ph(
18764            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18765            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18766        );
18767        assert_eq_m512h(r, e);
18768    }
18769
18770    #[simd_test(enable = "avx512fp16,avx512vl")]
18771    unsafe fn test_mm_mul_round_sch() {
18772        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18773        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18774        let r = _mm_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18775        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18776        assert_eq_m128h(r, e);
18777    }
18778
18779    #[simd_test(enable = "avx512fp16,avx512vl")]
18780    unsafe fn test_mm_mask_mul_round_sch() {
18781        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18782        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18783        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18784        let r = _mm_mask_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18785            src, 0, a, b,
18786        );
18787        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18788        assert_eq_m128h(r, e);
18789    }
18790
18791    #[simd_test(enable = "avx512fp16,avx512vl")]
18792    unsafe fn test_mm_maskz_mul_round_sch() {
18793        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18794        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18795        let r =
18796            _mm_maskz_mul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
18797        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18798        assert_eq_m128h(r, e);
18799    }
18800
18801    #[simd_test(enable = "avx512fp16,avx512vl")]
18802    unsafe fn test_mm_mul_sch() {
18803        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18804        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18805        let r = _mm_mul_sch(a, b);
18806        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18807        assert_eq_m128h(r, e);
18808    }
18809
18810    #[simd_test(enable = "avx512fp16,avx512vl")]
18811    unsafe fn test_mm_mask_mul_sch() {
18812        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18813        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18814        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18815        let r = _mm_mask_mul_sch(src, 0, a, b);
18816        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18817        assert_eq_m128h(r, e);
18818    }
18819
18820    #[simd_test(enable = "avx512fp16,avx512vl")]
18821    unsafe fn test_mm_maskz_mul_sch() {
18822        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18823        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18824        let r = _mm_maskz_mul_sch(0, a, b);
18825        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18826        assert_eq_m128h(r, e);
18827    }
18828
18829    #[simd_test(enable = "avx512fp16,avx512vl")]
18830    unsafe fn test_mm_fmul_pch() {
18831        let a = _mm_set1_pch(0.0, 1.0);
18832        let b = _mm_set1_pch(0.0, 1.0);
18833        let r = _mm_fmul_pch(a, b);
18834        let e = _mm_set1_pch(-1.0, 0.0);
18835        assert_eq_m128h(r, e);
18836    }
18837
18838    #[simd_test(enable = "avx512fp16,avx512vl")]
18839    unsafe fn test_mm_mask_fmul_pch() {
18840        let a = _mm_set1_pch(0.0, 1.0);
18841        let b = _mm_set1_pch(0.0, 1.0);
18842        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
18843        let r = _mm_mask_fmul_pch(src, 0b0101, a, b);
18844        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
18845        assert_eq_m128h(r, e);
18846    }
18847
18848    #[simd_test(enable = "avx512fp16,avx512vl")]
18849    unsafe fn test_mm_maskz_fmul_pch() {
18850        let a = _mm_set1_pch(0.0, 1.0);
18851        let b = _mm_set1_pch(0.0, 1.0);
18852        let r = _mm_maskz_fmul_pch(0b0101, a, b);
18853        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
18854        assert_eq_m128h(r, e);
18855    }
18856
18857    #[simd_test(enable = "avx512fp16,avx512vl")]
18858    unsafe fn test_mm256_fmul_pch() {
18859        let a = _mm256_set1_pch(0.0, 1.0);
18860        let b = _mm256_set1_pch(0.0, 1.0);
18861        let r = _mm256_fmul_pch(a, b);
18862        let e = _mm256_set1_pch(-1.0, 0.0);
18863        assert_eq_m256h(r, e);
18864    }
18865
18866    #[simd_test(enable = "avx512fp16,avx512vl")]
18867    unsafe fn test_mm256_mask_fmul_pch() {
18868        let a = _mm256_set1_pch(0.0, 1.0);
18869        let b = _mm256_set1_pch(0.0, 1.0);
18870        let src = _mm256_setr_ph(
18871            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18872        );
18873        let r = _mm256_mask_fmul_pch(src, 0b01010101, a, b);
18874        let e = _mm256_setr_ph(
18875            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18876        );
18877        assert_eq_m256h(r, e);
18878    }
18879
18880    #[simd_test(enable = "avx512fp16,avx512vl")]
18881    unsafe fn test_mm256_maskz_fmul_pch() {
18882        let a = _mm256_set1_pch(0.0, 1.0);
18883        let b = _mm256_set1_pch(0.0, 1.0);
18884        let r = _mm256_maskz_fmul_pch(0b01010101, a, b);
18885        let e = _mm256_setr_ph(
18886            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18887        );
18888        assert_eq_m256h(r, e);
18889    }
18890
18891    #[simd_test(enable = "avx512fp16")]
18892    unsafe fn test_mm512_fmul_pch() {
18893        let a = _mm512_set1_pch(0.0, 1.0);
18894        let b = _mm512_set1_pch(0.0, 1.0);
18895        let r = _mm512_fmul_pch(a, b);
18896        let e = _mm512_set1_pch(-1.0, 0.0);
18897        assert_eq_m512h(r, e);
18898    }
18899
18900    #[simd_test(enable = "avx512fp16")]
18901    unsafe fn test_mm512_mask_fmul_pch() {
18902        let a = _mm512_set1_pch(0.0, 1.0);
18903        let b = _mm512_set1_pch(0.0, 1.0);
18904        let src = _mm512_setr_ph(
18905            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18906            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18907            32.0, 33.0,
18908        );
18909        let r = _mm512_mask_fmul_pch(src, 0b0101010101010101, a, b);
18910        let e = _mm512_setr_ph(
18911            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18912            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18913            33.0,
18914        );
18915        assert_eq_m512h(r, e);
18916    }
18917
18918    #[simd_test(enable = "avx512fp16")]
18919    unsafe fn test_mm512_maskz_fmul_pch() {
18920        let a = _mm512_set1_pch(0.0, 1.0);
18921        let b = _mm512_set1_pch(0.0, 1.0);
18922        let r = _mm512_maskz_fmul_pch(0b0101010101010101, a, b);
18923        let e = _mm512_setr_ph(
18924            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18925            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18926        );
18927        assert_eq_m512h(r, e);
18928    }
18929
18930    #[simd_test(enable = "avx512fp16")]
18931    unsafe fn test_mm512_fmul_round_pch() {
18932        let a = _mm512_set1_pch(0.0, 1.0);
18933        let b = _mm512_set1_pch(0.0, 1.0);
18934        let r = _mm512_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18935        let e = _mm512_set1_pch(-1.0, 0.0);
18936        assert_eq_m512h(r, e);
18937    }
18938
18939    #[simd_test(enable = "avx512fp16")]
18940    unsafe fn test_mm512_mask_fmul_round_pch() {
18941        let a = _mm512_set1_pch(0.0, 1.0);
18942        let b = _mm512_set1_pch(0.0, 1.0);
18943        let src = _mm512_setr_ph(
18944            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
18945            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
18946            32.0, 33.0,
18947        );
18948        let r = _mm512_mask_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18949            src,
18950            0b0101010101010101,
18951            a,
18952            b,
18953        );
18954        let e = _mm512_setr_ph(
18955            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
18956            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
18957            33.0,
18958        );
18959        assert_eq_m512h(r, e);
18960    }
18961
18962    #[simd_test(enable = "avx512fp16")]
18963    unsafe fn test_mm512_maskz_fmul_round_pch() {
18964        let a = _mm512_set1_pch(0.0, 1.0);
18965        let b = _mm512_set1_pch(0.0, 1.0);
18966        let r = _mm512_maskz_fmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18967            0b0101010101010101,
18968            a,
18969            b,
18970        );
18971        let e = _mm512_setr_ph(
18972            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18973            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
18974        );
18975        assert_eq_m512h(r, e);
18976    }
18977
18978    #[simd_test(enable = "avx512fp16,avx512vl")]
18979    unsafe fn test_mm_fmul_round_sch() {
18980        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18981        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18982        let r = _mm_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
18983        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18984        assert_eq_m128h(r, e);
18985    }
18986
18987    #[simd_test(enable = "avx512fp16,avx512vl")]
18988    unsafe fn test_mm_mask_fmul_round_sch() {
18989        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18990        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
18991        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
18992        let r = _mm_mask_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
18993            src, 0, a, b,
18994        );
18995        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
18996        assert_eq_m128h(r, e);
18997    }
18998
18999    #[simd_test(enable = "avx512fp16,avx512vl")]
19000    unsafe fn test_mm_maskz_fmul_round_sch() {
19001        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19002        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19003        let r =
19004            _mm_maskz_fmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19005        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19006        assert_eq_m128h(r, e);
19007    }
19008
19009    #[simd_test(enable = "avx512fp16,avx512vl")]
19010    unsafe fn test_mm_fmul_sch() {
19011        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19012        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19013        let r = _mm_fmul_sch(a, b);
19014        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19015        assert_eq_m128h(r, e);
19016    }
19017
19018    #[simd_test(enable = "avx512fp16,avx512vl")]
19019    unsafe fn test_mm_mask_fmul_sch() {
19020        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19021        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19022        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19023        let r = _mm_mask_fmul_sch(src, 0, a, b);
19024        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19025        assert_eq_m128h(r, e);
19026    }
19027
19028    #[simd_test(enable = "avx512fp16,avx512vl")]
19029    unsafe fn test_mm_maskz_fmul_sch() {
19030        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19031        let b = _mm_setr_ph(0.0, 1.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19032        let r = _mm_maskz_fmul_sch(0, a, b);
19033        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19034        assert_eq_m128h(r, e);
19035    }
19036
19037    #[simd_test(enable = "avx512fp16,avx512vl")]
19038    unsafe fn test_mm_cmul_pch() {
19039        let a = _mm_set1_pch(0.0, 1.0);
19040        let b = _mm_set1_pch(0.0, -1.0);
19041        let r = _mm_cmul_pch(a, b);
19042        let e = _mm_set1_pch(-1.0, 0.0);
19043        assert_eq_m128h(r, e);
19044    }
19045
19046    #[simd_test(enable = "avx512fp16,avx512vl")]
19047    unsafe fn test_mm_mask_cmul_pch() {
19048        let a = _mm_set1_pch(0.0, 1.0);
19049        let b = _mm_set1_pch(0.0, -1.0);
19050        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19051        let r = _mm_mask_cmul_pch(src, 0b0101, a, b);
19052        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
19053        assert_eq_m128h(r, e);
19054    }
19055
19056    #[simd_test(enable = "avx512fp16,avx512vl")]
19057    unsafe fn test_mm_maskz_cmul_pch() {
19058        let a = _mm_set1_pch(0.0, 1.0);
19059        let b = _mm_set1_pch(0.0, -1.0);
19060        let r = _mm_maskz_cmul_pch(0b0101, a, b);
19061        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
19062        assert_eq_m128h(r, e);
19063    }
19064
19065    #[simd_test(enable = "avx512fp16,avx512vl")]
19066    unsafe fn test_mm256_cmul_pch() {
19067        let a = _mm256_set1_pch(0.0, 1.0);
19068        let b = _mm256_set1_pch(0.0, -1.0);
19069        let r = _mm256_cmul_pch(a, b);
19070        let e = _mm256_set1_pch(-1.0, 0.0);
19071        assert_eq_m256h(r, e);
19072    }
19073
19074    #[simd_test(enable = "avx512fp16,avx512vl")]
19075    unsafe fn test_mm256_mask_cmul_pch() {
19076        let a = _mm256_set1_pch(0.0, 1.0);
19077        let b = _mm256_set1_pch(0.0, -1.0);
19078        let src = _mm256_setr_ph(
19079            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19080        );
19081        let r = _mm256_mask_cmul_pch(src, 0b01010101, a, b);
19082        let e = _mm256_setr_ph(
19083            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19084        );
19085        assert_eq_m256h(r, e);
19086    }
19087
19088    #[simd_test(enable = "avx512fp16,avx512vl")]
19089    unsafe fn test_mm256_maskz_cmul_pch() {
19090        let a = _mm256_set1_pch(0.0, 1.0);
19091        let b = _mm256_set1_pch(0.0, -1.0);
19092        let r = _mm256_maskz_cmul_pch(0b01010101, a, b);
19093        let e = _mm256_setr_ph(
19094            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19095        );
19096        assert_eq_m256h(r, e);
19097    }
19098
19099    #[simd_test(enable = "avx512fp16")]
19100    unsafe fn test_mm512_cmul_pch() {
19101        let a = _mm512_set1_pch(0.0, 1.0);
19102        let b = _mm512_set1_pch(0.0, -1.0);
19103        let r = _mm512_cmul_pch(a, b);
19104        let e = _mm512_set1_pch(-1.0, 0.0);
19105        assert_eq_m512h(r, e);
19106    }
19107
19108    #[simd_test(enable = "avx512fp16")]
19109    unsafe fn test_mm512_mask_cmul_pch() {
19110        let a = _mm512_set1_pch(0.0, 1.0);
19111        let b = _mm512_set1_pch(0.0, -1.0);
19112        let src = _mm512_setr_ph(
19113            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19114            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19115            32.0, 33.0,
19116        );
19117        let r = _mm512_mask_cmul_pch(src, 0b0101010101010101, a, b);
19118        let e = _mm512_setr_ph(
19119            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19120            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19121            33.0,
19122        );
19123        assert_eq_m512h(r, e);
19124    }
19125
19126    #[simd_test(enable = "avx512fp16")]
19127    unsafe fn test_mm512_maskz_cmul_pch() {
19128        let a = _mm512_set1_pch(0.0, 1.0);
19129        let b = _mm512_set1_pch(0.0, -1.0);
19130        let r = _mm512_maskz_cmul_pch(0b0101010101010101, a, b);
19131        let e = _mm512_setr_ph(
19132            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19133            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19134        );
19135        assert_eq_m512h(r, e);
19136    }
19137
19138    #[simd_test(enable = "avx512fp16")]
19139    unsafe fn test_mm512_cmul_round_pch() {
19140        let a = _mm512_set1_pch(0.0, 1.0);
19141        let b = _mm512_set1_pch(0.0, -1.0);
19142        let r = _mm512_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19143        let e = _mm512_set1_pch(-1.0, 0.0);
19144        assert_eq_m512h(r, e);
19145    }
19146
19147    #[simd_test(enable = "avx512fp16")]
19148    unsafe fn test_mm512_mask_cmul_round_pch() {
19149        let a = _mm512_set1_pch(0.0, 1.0);
19150        let b = _mm512_set1_pch(0.0, -1.0);
19151        let src = _mm512_setr_ph(
19152            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19153            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19154            32.0, 33.0,
19155        );
19156        let r = _mm512_mask_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19157            src,
19158            0b0101010101010101,
19159            a,
19160            b,
19161        );
19162        let e = _mm512_setr_ph(
19163            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19164            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19165            33.0,
19166        );
19167        assert_eq_m512h(r, e);
19168    }
19169
19170    #[simd_test(enable = "avx512fp16")]
19171    unsafe fn test_mm512_maskz_cmul_round_pch() {
19172        let a = _mm512_set1_pch(0.0, 1.0);
19173        let b = _mm512_set1_pch(0.0, -1.0);
19174        let r = _mm512_maskz_cmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19175            0b0101010101010101,
19176            a,
19177            b,
19178        );
19179        let e = _mm512_setr_ph(
19180            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19181            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19182        );
19183        assert_eq_m512h(r, e);
19184    }
19185
19186    #[simd_test(enable = "avx512fp16,avx512vl")]
19187    unsafe fn test_mm_cmul_sch() {
19188        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19189        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19190        let r = _mm_cmul_sch(a, b);
19191        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19192        assert_eq_m128h(r, e);
19193    }
19194
19195    #[simd_test(enable = "avx512fp16,avx512vl")]
19196    unsafe fn test_mm_mask_cmul_sch() {
19197        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19198        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19199        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19200        let r = _mm_mask_cmul_sch(src, 0, a, b);
19201        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19202        assert_eq_m128h(r, e);
19203    }
19204
19205    #[simd_test(enable = "avx512fp16,avx512vl")]
19206    unsafe fn test_mm_maskz_cmul_sch() {
19207        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19208        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19209        let r = _mm_maskz_cmul_sch(0, a, b);
19210        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19211        assert_eq_m128h(r, e);
19212    }
19213
19214    #[simd_test(enable = "avx512fp16,avx512vl")]
19215    unsafe fn test_mm_cmul_round_sch() {
19216        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19217        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19218        let r = _mm_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19219        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19220        assert_eq_m128h(r, e);
19221    }
19222
19223    #[simd_test(enable = "avx512fp16,avx512vl")]
19224    unsafe fn test_mm_mask_cmul_round_sch() {
19225        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19226        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19227        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19228        let r = _mm_mask_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19229            src, 0, a, b,
19230        );
19231        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19232        assert_eq_m128h(r, e);
19233    }
19234
19235    #[simd_test(enable = "avx512fp16,avx512vl")]
19236    unsafe fn test_mm_maskz_cmul_round_sch() {
19237        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19238        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19239        let r =
19240            _mm_maskz_cmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19241        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19242        assert_eq_m128h(r, e);
19243    }
19244
19245    #[simd_test(enable = "avx512fp16,avx512vl")]
19246    unsafe fn test_mm_fcmul_pch() {
19247        let a = _mm_set1_pch(0.0, 1.0);
19248        let b = _mm_set1_pch(0.0, -1.0);
19249        let r = _mm_fcmul_pch(a, b);
19250        let e = _mm_set1_pch(-1.0, 0.0);
19251        assert_eq_m128h(r, e);
19252    }
19253
19254    #[simd_test(enable = "avx512fp16,avx512vl")]
19255    unsafe fn test_mm_mask_fcmul_pch() {
19256        let a = _mm_set1_pch(0.0, 1.0);
19257        let b = _mm_set1_pch(0.0, -1.0);
19258        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19259        let r = _mm_mask_fcmul_pch(src, 0b0101, a, b);
19260        let e = _mm_setr_ph(-1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0);
19261        assert_eq_m128h(r, e);
19262    }
19263
19264    #[simd_test(enable = "avx512fp16,avx512vl")]
19265    unsafe fn test_mm_maskz_fcmul_pch() {
19266        let a = _mm_set1_pch(0.0, 1.0);
19267        let b = _mm_set1_pch(0.0, -1.0);
19268        let r = _mm_maskz_fcmul_pch(0b0101, a, b);
19269        let e = _mm_setr_ph(-1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0);
19270        assert_eq_m128h(r, e);
19271    }
19272
19273    #[simd_test(enable = "avx512fp16,avx512vl")]
19274    unsafe fn test_mm256_fcmul_pch() {
19275        let a = _mm256_set1_pch(0.0, 1.0);
19276        let b = _mm256_set1_pch(0.0, -1.0);
19277        let r = _mm256_fcmul_pch(a, b);
19278        let e = _mm256_set1_pch(-1.0, 0.0);
19279        assert_eq_m256h(r, e);
19280    }
19281
19282    #[simd_test(enable = "avx512fp16,avx512vl")]
19283    unsafe fn test_mm256_mask_fcmul_pch() {
19284        let a = _mm256_set1_pch(0.0, 1.0);
19285        let b = _mm256_set1_pch(0.0, -1.0);
19286        let src = _mm256_setr_ph(
19287            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19288        );
19289        let r = _mm256_mask_fcmul_pch(src, 0b01010101, a, b);
19290        let e = _mm256_setr_ph(
19291            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19292        );
19293        assert_eq_m256h(r, e);
19294    }
19295
19296    #[simd_test(enable = "avx512fp16,avx512vl")]
19297    unsafe fn test_mm256_maskz_fcmul_pch() {
19298        let a = _mm256_set1_pch(0.0, 1.0);
19299        let b = _mm256_set1_pch(0.0, -1.0);
19300        let r = _mm256_maskz_fcmul_pch(0b01010101, a, b);
19301        let e = _mm256_setr_ph(
19302            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19303        );
19304        assert_eq_m256h(r, e);
19305    }
19306
19307    #[simd_test(enable = "avx512fp16")]
19308    unsafe fn test_mm512_fcmul_pch() {
19309        let a = _mm512_set1_pch(0.0, 1.0);
19310        let b = _mm512_set1_pch(0.0, -1.0);
19311        let r = _mm512_fcmul_pch(a, b);
19312        let e = _mm512_set1_pch(-1.0, 0.0);
19313        assert_eq_m512h(r, e);
19314    }
19315
19316    #[simd_test(enable = "avx512fp16")]
19317    unsafe fn test_mm512_mask_fcmul_pch() {
19318        let a = _mm512_set1_pch(0.0, 1.0);
19319        let b = _mm512_set1_pch(0.0, -1.0);
19320        let src = _mm512_setr_ph(
19321            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19322            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19323            32.0, 33.0,
19324        );
19325        let r = _mm512_mask_fcmul_pch(src, 0b0101010101010101, a, b);
19326        let e = _mm512_setr_ph(
19327            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19328            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19329            33.0,
19330        );
19331        assert_eq_m512h(r, e);
19332    }
19333
19334    #[simd_test(enable = "avx512fp16")]
19335    unsafe fn test_mm512_maskz_fcmul_pch() {
19336        let a = _mm512_set1_pch(0.0, 1.0);
19337        let b = _mm512_set1_pch(0.0, -1.0);
19338        let r = _mm512_maskz_fcmul_pch(0b0101010101010101, a, b);
19339        let e = _mm512_setr_ph(
19340            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19341            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19342        );
19343        assert_eq_m512h(r, e);
19344    }
19345
19346    #[simd_test(enable = "avx512fp16")]
19347    unsafe fn test_mm512_fcmul_round_pch() {
19348        let a = _mm512_set1_pch(0.0, 1.0);
19349        let b = _mm512_set1_pch(0.0, -1.0);
19350        let r = _mm512_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19351        let e = _mm512_set1_pch(-1.0, 0.0);
19352        assert_eq_m512h(r, e);
19353    }
19354
19355    #[simd_test(enable = "avx512fp16")]
19356    unsafe fn test_mm512_mask_fcmul_round_pch() {
19357        let a = _mm512_set1_pch(0.0, 1.0);
19358        let b = _mm512_set1_pch(0.0, -1.0);
19359        let src = _mm512_setr_ph(
19360            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19361            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19362            32.0, 33.0,
19363        );
19364        let r = _mm512_mask_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19365            src,
19366            0b0101010101010101,
19367            a,
19368            b,
19369        );
19370        let e = _mm512_setr_ph(
19371            -1.0, 0.0, 4.0, 5.0, -1.0, 0.0, 8.0, 9.0, -1.0, 0.0, 12.0, 13.0, -1.0, 0.0, 16.0, 17.0,
19372            -1.0, 0.0, 20.0, 21.0, -1.0, 0.0, 24.0, 25.0, -1.0, 0.0, 28.0, 29.0, -1.0, 0.0, 32.0,
19373            33.0,
19374        );
19375        assert_eq_m512h(r, e);
19376    }
19377
19378    #[simd_test(enable = "avx512fp16")]
19379    unsafe fn test_mm512_maskz_fcmul_round_pch() {
19380        let a = _mm512_set1_pch(0.0, 1.0);
19381        let b = _mm512_set1_pch(0.0, -1.0);
19382        let r = _mm512_maskz_fcmul_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19383            0b0101010101010101,
19384            a,
19385            b,
19386        );
19387        let e = _mm512_setr_ph(
19388            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19389            -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0,
19390        );
19391        assert_eq_m512h(r, e);
19392    }
19393
19394    #[simd_test(enable = "avx512fp16,avx512vl")]
19395    unsafe fn test_mm_fcmul_sch() {
19396        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19397        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19398        let r = _mm_fcmul_sch(a, b);
19399        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19400        assert_eq_m128h(r, e);
19401    }
19402
19403    #[simd_test(enable = "avx512fp16,avx512vl")]
19404    unsafe fn test_mm_mask_fcmul_sch() {
19405        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19406        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19407        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19408        let r = _mm_mask_fcmul_sch(src, 0, a, b);
19409        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19410        assert_eq_m128h(r, e);
19411    }
19412
19413    #[simd_test(enable = "avx512fp16,avx512vl")]
19414    unsafe fn test_mm_maskz_fcmul_sch() {
19415        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19416        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19417        let r = _mm_maskz_fcmul_sch(0, a, b);
19418        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19419        assert_eq_m128h(r, e);
19420    }
19421
19422    #[simd_test(enable = "avx512fp16,avx512vl")]
19423    unsafe fn test_mm_fcmul_round_sch() {
19424        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19425        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19426        let r = _mm_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
19427        let e = _mm_setr_ph(-1.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19428        assert_eq_m128h(r, e);
19429    }
19430
19431    #[simd_test(enable = "avx512fp16,avx512vl")]
19432    unsafe fn test_mm_mask_fcmul_round_sch() {
19433        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19434        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19435        let src = _mm_setr_ph(14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0);
19436        let r = _mm_mask_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19437            src, 0, a, b,
19438        );
19439        let e = _mm_setr_ph(14.0, 15.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19440        assert_eq_m128h(r, e);
19441    }
19442
19443    #[simd_test(enable = "avx512fp16,avx512vl")]
19444    unsafe fn test_mm_maskz_fcmul_round_sch() {
19445        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19446        let b = _mm_setr_ph(0.0, -1.0, 8.0, -9.0, 10.0, -11.0, 12.0, -13.0);
19447        let r =
19448            _mm_maskz_fcmul_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
19449        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19450        assert_eq_m128h(r, e);
19451    }
19452
19453    #[simd_test(enable = "avx512fp16,avx512vl")]
19454    unsafe fn test_mm_abs_ph() {
19455        let a = _mm_set_ph(-1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0);
19456        let r = _mm_abs_ph(a);
19457        let e = _mm_set_ph(1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0);
19458        assert_eq_m128h(r, e);
19459    }
19460
19461    #[simd_test(enable = "avx512fp16,avx512vl")]
19462    unsafe fn test_mm256_abs_ph() {
19463        let a = _mm256_set_ph(
19464            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19465            -14.0,
19466        );
19467        let r = _mm256_abs_ph(a);
19468        let e = _mm256_set_ph(
19469            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19470        );
19471        assert_eq_m256h(r, e);
19472    }
19473
19474    #[simd_test(enable = "avx512fp16")]
19475    unsafe fn test_mm512_abs_ph() {
19476        let a = _mm512_set_ph(
19477            -1.0, 0.0, 1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0,
19478            -14.0, 15.0, -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0,
19479            27.0, -28.0, 29.0, -30.0,
19480        );
19481        let r = _mm512_abs_ph(a);
19482        let e = _mm512_set_ph(
19483            1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
19484            15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0,
19485            29.0, 30.0,
19486        );
19487        assert_eq_m512h(r, e);
19488    }
19489
19490    #[simd_test(enable = "avx512fp16,avx512vl")]
19491    unsafe fn test_mm_conj_pch() {
19492        let a = _mm_set1_pch(0.0, 1.0);
19493        let r = _mm_conj_pch(a);
19494        let e = _mm_set1_pch(0.0, -1.0);
19495        assert_eq_m128h(r, e);
19496    }
19497
19498    #[simd_test(enable = "avx512fp16,avx512vl")]
19499    unsafe fn test_mm_mask_conj_pch() {
19500        let a = _mm_set1_pch(0.0, 1.0);
19501        let src = _mm_setr_ph(2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0);
19502        let r = _mm_mask_conj_pch(src, 0b0101, a);
19503        let e = _mm_setr_ph(0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0);
19504        assert_eq_m128h(r, e);
19505    }
19506
19507    #[simd_test(enable = "avx512fp16,avx512vl")]
19508    unsafe fn test_mm_maskz_conj_pch() {
19509        let a = _mm_set1_pch(0.0, 1.0);
19510        let r = _mm_maskz_conj_pch(0b0101, a);
19511        let e = _mm_setr_ph(0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0);
19512        assert_eq_m128h(r, e);
19513    }
19514
19515    #[simd_test(enable = "avx512fp16,avx512vl")]
19516    unsafe fn test_mm256_conj_pch() {
19517        let a = _mm256_set1_pch(0.0, 1.0);
19518        let r = _mm256_conj_pch(a);
19519        let e = _mm256_set1_pch(0.0, -1.0);
19520        assert_eq_m256h(r, e);
19521    }
19522
19523    #[simd_test(enable = "avx512fp16,avx512vl")]
19524    unsafe fn test_mm256_mask_conj_pch() {
19525        let a = _mm256_set1_pch(0.0, 1.0);
19526        let src = _mm256_setr_ph(
19527            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19528        );
19529        let r = _mm256_mask_conj_pch(src, 0b01010101, a);
19530        let e = _mm256_setr_ph(
19531            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19532        );
19533        assert_eq_m256h(r, e);
19534    }
19535
19536    #[simd_test(enable = "avx512fp16,avx512vl")]
19537    unsafe fn test_mm256_maskz_conj_pch() {
19538        let a = _mm256_set1_pch(0.0, 1.0);
19539        let r = _mm256_maskz_conj_pch(0b01010101, a);
19540        let e = _mm256_setr_ph(
19541            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19542        );
19543        assert_eq_m256h(r, e);
19544    }
19545
19546    #[simd_test(enable = "avx512fp16")]
19547    unsafe fn test_mm512_conj_pch() {
19548        let a = _mm512_set1_pch(0.0, 1.0);
19549        let r = _mm512_conj_pch(a);
19550        let e = _mm512_set1_pch(0.0, -1.0);
19551        assert_eq_m512h(r, e);
19552    }
19553
19554    #[simd_test(enable = "avx512fp16")]
19555    unsafe fn test_mm512_mask_conj_pch() {
19556        let a = _mm512_set1_pch(0.0, 1.0);
19557        let src = _mm512_setr_ph(
19558            2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0,
19559            18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0,
19560            32.0, 33.0,
19561        );
19562        let r = _mm512_mask_conj_pch(src, 0b0101010101010101, a);
19563        let e = _mm512_setr_ph(
19564            0.0, -1.0, 4.0, 5.0, 0.0, -1.0, 8.0, 9.0, 0.0, -1.0, 12.0, 13.0, 0.0, -1.0, 16.0, 17.0,
19565            0.0, -1.0, 20.0, 21.0, 0.0, -1.0, 24.0, 25.0, 0.0, -1.0, 28.0, 29.0, 0.0, -1.0, 32.0,
19566            33.0,
19567        );
19568        assert_eq_m512h(r, e);
19569    }
19570
19571    #[simd_test(enable = "avx512fp16")]
19572    unsafe fn test_mm512_maskz_conj_pch() {
19573        let a = _mm512_set1_pch(0.0, 1.0);
19574        let r = _mm512_maskz_conj_pch(0b0101010101010101, a);
19575        let e = _mm512_setr_ph(
19576            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19577            0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, -1.0, 0.0, 0.0,
19578        );
19579        assert_eq_m512h(r, e);
19580    }
19581
19582    #[simd_test(enable = "avx512fp16,avx512vl")]
19583    unsafe fn test_mm_fmadd_pch() {
19584        let a = _mm_set1_pch(0.0, 1.0);
19585        let b = _mm_set1_pch(0.0, 2.0);
19586        let c = _mm_set1_pch(0.0, 3.0);
19587        let r = _mm_fmadd_pch(a, b, c);
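             // Complex multiply-add per pair: (0 + 1i) * (0 + 2i) + (0 + 3i) = -2 + 3i.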
19588        let e = _mm_set1_pch(-2.0, 3.0);
19589        assert_eq_m128h(r, e);
19590    }
19591
19592    #[simd_test(enable = "avx512fp16,avx512vl")]
19593    unsafe fn test_mm_mask_fmadd_pch() {
19594        let a = _mm_set1_pch(0.0, 1.0);
19595        let b = _mm_set1_pch(0.0, 2.0);
19596        let c = _mm_set1_pch(0.0, 3.0);
19597        let r = _mm_mask_fmadd_pch(a, 0b0101, b, c);
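             // Pairs with a cleared mask bit keep the corresponding pair from a (0 + 1i).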
19598        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0);
19599        assert_eq_m128h(r, e);
19600    }
19601
19602    #[simd_test(enable = "avx512fp16,avx512vl")]
19603    unsafe fn test_mm_mask3_fmadd_pch() {
19604        let a = _mm_set1_pch(0.0, 1.0);
19605        let b = _mm_set1_pch(0.0, 2.0);
19606        let c = _mm_set1_pch(0.0, 3.0);
19607        let r = _mm_mask3_fmadd_pch(a, b, c, 0b0101);
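             // In the mask3 form, pairs with a cleared mask bit keep the corresponding pair from c (0 + 3i).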
19608        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0);
19609        assert_eq_m128h(r, e);
19610    }
19611
19612    #[simd_test(enable = "avx512fp16,avx512vl")]
19613    unsafe fn test_mm_maskz_fmadd_pch() {
19614        let a = _mm_set1_pch(0.0, 1.0);
19615        let b = _mm_set1_pch(0.0, 2.0);
19616        let c = _mm_set1_pch(0.0, 3.0);
19617        let r = _mm_maskz_fmadd_pch(0b0101, a, b, c);
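             // In the maskz form, pairs with a cleared mask bit are zeroed.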
19618        let e = _mm_setr_ph(-2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0);
19619        assert_eq_m128h(r, e);
19620    }
19621
19622    #[simd_test(enable = "avx512fp16,avx512vl")]
19623    unsafe fn test_mm256_fmadd_pch() {
19624        let a = _mm256_set1_pch(0.0, 1.0);
19625        let b = _mm256_set1_pch(0.0, 2.0);
19626        let c = _mm256_set1_pch(0.0, 3.0);
19627        let r = _mm256_fmadd_pch(a, b, c);
19628        let e = _mm256_set1_pch(-2.0, 3.0);
19629        assert_eq_m256h(r, e);
19630    }
19631
19632    #[simd_test(enable = "avx512fp16,avx512vl")]
19633    unsafe fn test_mm256_mask_fmadd_pch() {
19634        let a = _mm256_set1_pch(0.0, 1.0);
19635        let b = _mm256_set1_pch(0.0, 2.0);
19636        let c = _mm256_set1_pch(0.0, 3.0);
19637        let r = _mm256_mask_fmadd_pch(a, 0b01010101, b, c);
19638        let e = _mm256_setr_ph(
19639            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19640        );
19641        assert_eq_m256h(r, e);
19642    }
19643
19644    #[simd_test(enable = "avx512fp16,avx512vl")]
19645    unsafe fn test_mm256_mask3_fmadd_pch() {
19646        let a = _mm256_set1_pch(0.0, 1.0);
19647        let b = _mm256_set1_pch(0.0, 2.0);
19648        let c = _mm256_set1_pch(0.0, 3.0);
19649        let r = _mm256_mask3_fmadd_pch(a, b, c, 0b01010101);
19650        let e = _mm256_setr_ph(
19651            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19652        );
19653        assert_eq_m256h(r, e);
19654    }
19655
19656    #[simd_test(enable = "avx512fp16,avx512vl")]
19657    unsafe fn test_mm256_maskz_fmadd_pch() {
19658        let a = _mm256_set1_pch(0.0, 1.0);
19659        let b = _mm256_set1_pch(0.0, 2.0);
19660        let c = _mm256_set1_pch(0.0, 3.0);
19661        let r = _mm256_maskz_fmadd_pch(0b01010101, a, b, c);
19662        let e = _mm256_setr_ph(
19663            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19664        );
19665        assert_eq_m256h(r, e);
19666    }
19667
19668    #[simd_test(enable = "avx512fp16")]
19669    unsafe fn test_mm512_fmadd_pch() {
19670        let a = _mm512_set1_pch(0.0, 1.0);
19671        let b = _mm512_set1_pch(0.0, 2.0);
19672        let c = _mm512_set1_pch(0.0, 3.0);
19673        let r = _mm512_fmadd_pch(a, b, c);
19674        let e = _mm512_set1_pch(-2.0, 3.0);
19675        assert_eq_m512h(r, e);
19676    }
19677
19678    #[simd_test(enable = "avx512fp16")]
19679    unsafe fn test_mm512_mask_fmadd_pch() {
19680        let a = _mm512_set1_pch(0.0, 1.0);
19681        let b = _mm512_set1_pch(0.0, 2.0);
19682        let c = _mm512_set1_pch(0.0, 3.0);
19683        let r = _mm512_mask_fmadd_pch(a, 0b0101010101010101, b, c);
19684        let e = _mm512_setr_ph(
19685            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19686            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19687        );
19688        assert_eq_m512h(r, e);
19689    }
19690
19691    #[simd_test(enable = "avx512fp16")]
19692    unsafe fn test_mm512_mask3_fmadd_pch() {
19693        let a = _mm512_set1_pch(0.0, 1.0);
19694        let b = _mm512_set1_pch(0.0, 2.0);
19695        let c = _mm512_set1_pch(0.0, 3.0);
19696        let r = _mm512_mask3_fmadd_pch(a, b, c, 0b0101010101010101);
19697        let e = _mm512_setr_ph(
19698            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19699            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19700        );
19701        assert_eq_m512h(r, e);
19702    }
19703
19704    #[simd_test(enable = "avx512fp16")]
19705    unsafe fn test_mm512_maskz_fmadd_pch() {
19706        let a = _mm512_set1_pch(0.0, 1.0);
19707        let b = _mm512_set1_pch(0.0, 2.0);
19708        let c = _mm512_set1_pch(0.0, 3.0);
19709        let r = _mm512_maskz_fmadd_pch(0b0101010101010101, a, b, c);
19710        let e = _mm512_setr_ph(
19711            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19712            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19713        );
19714        assert_eq_m512h(r, e);
19715    }
19716
19717    #[simd_test(enable = "avx512fp16")]
19718    unsafe fn test_mm512_fmadd_round_pch() {
19719        let a = _mm512_set1_pch(0.0, 1.0);
19720        let b = _mm512_set1_pch(0.0, 2.0);
19721        let c = _mm512_set1_pch(0.0, 3.0);
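             // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC selects round-to-nearest-even and suppresses floating-point exceptions (SAE).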
19722        let r =
19723            _mm512_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19724        let e = _mm512_set1_pch(-2.0, 3.0);
19725        assert_eq_m512h(r, e);
19726    }
19727
19728    #[simd_test(enable = "avx512fp16")]
19729    unsafe fn test_mm512_mask_fmadd_round_pch() {
19730        let a = _mm512_set1_pch(0.0, 1.0);
19731        let b = _mm512_set1_pch(0.0, 2.0);
19732        let c = _mm512_set1_pch(0.0, 3.0);
19733        let r = _mm512_mask_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19734            a,
19735            0b0101010101010101,
19736            b,
19737            c,
19738        );
19739        let e = _mm512_setr_ph(
19740            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19741            -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0, -2.0, 3.0, 0.0, 1.0,
19742        );
19743        assert_eq_m512h(r, e);
19744    }
19745
19746    #[simd_test(enable = "avx512fp16")]
19747    unsafe fn test_mm512_mask3_fmadd_round_pch() {
19748        let a = _mm512_set1_pch(0.0, 1.0);
19749        let b = _mm512_set1_pch(0.0, 2.0);
19750        let c = _mm512_set1_pch(0.0, 3.0);
19751        let r = _mm512_mask3_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19752            a,
19753            b,
19754            c,
19755            0b0101010101010101,
19756        );
19757        let e = _mm512_setr_ph(
19758            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19759            -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0, -2.0, 3.0, 0.0, 3.0,
19760        );
19761        assert_eq_m512h(r, e);
19762    }
19763
19764    #[simd_test(enable = "avx512fp16")]
19765    unsafe fn test_mm512_maskz_fmadd_round_pch() {
19766        let a = _mm512_set1_pch(0.0, 1.0);
19767        let b = _mm512_set1_pch(0.0, 2.0);
19768        let c = _mm512_set1_pch(0.0, 3.0);
19769        let r = _mm512_maskz_fmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19770            0b0101010101010101,
19771            a,
19772            b,
19773            c,
19774        );
19775        let e = _mm512_setr_ph(
19776            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19777            -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0, -2.0, 3.0, 0.0, 0.0,
19778        );
19779        assert_eq_m512h(r, e);
19780    }
19781
19782    #[simd_test(enable = "avx512fp16,avx512vl")]
19783    unsafe fn test_mm_fmadd_sch() {
19784        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19785        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19786        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19787        let r = _mm_fmadd_sch(a, b, c);
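             // Scalar complex FMA: only the lowest pair is computed ((0 + 1i) * (0 + 2i) + (0 + 3i) = -2 + 3i); the upper lanes are copied from a.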
19788        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19789        assert_eq_m128h(r, e);
19790    }
19791
19792    #[simd_test(enable = "avx512fp16,avx512vl")]
19793    unsafe fn test_mm_mask_fmadd_sch() {
19794        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19795        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19796        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19797        let r = _mm_mask_fmadd_sch(a, 0, b, c);
19798        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19799        assert_eq_m128h(r, e);
19800        let r = _mm_mask_fmadd_sch(a, 1, b, c);
19801        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19802        assert_eq_m128h(r, e);
19803    }
19804
19805    #[simd_test(enable = "avx512fp16,avx512vl")]
19806    unsafe fn test_mm_mask3_fmadd_sch() {
19807        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19808        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19809        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19810        let r = _mm_mask3_fmadd_sch(a, b, c, 0);
19811        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19812        assert_eq_m128h(r, e);
19813        let r = _mm_mask3_fmadd_sch(a, b, c, 1);
19814        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19815        assert_eq_m128h(r, e);
19816    }
19817
19818    #[simd_test(enable = "avx512fp16,avx512vl")]
19819    unsafe fn test_mm_maskz_fmadd_sch() {
19820        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19821        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19822        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19823        let r = _mm_maskz_fmadd_sch(0, a, b, c);
19824        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19825        assert_eq_m128h(r, e);
19826        let r = _mm_maskz_fmadd_sch(1, a, b, c);
19827        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19828        assert_eq_m128h(r, e);
19829    }
19830
19831    #[simd_test(enable = "avx512fp16,avx512vl")]
19832    unsafe fn test_mm_fmadd_round_sch() {
19833        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19834        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19835        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19836        let r = _mm_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
19837        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19838        assert_eq_m128h(r, e);
19839    }
19840
19841    #[simd_test(enable = "avx512fp16,avx512vl")]
19842    unsafe fn test_mm_mask_fmadd_round_sch() {
19843        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19844        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19845        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19846        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19847            a, 0, b, c,
19848        );
19849        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19850        assert_eq_m128h(r, e);
19851        let r = _mm_mask_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19852            a, 1, b, c,
19853        );
19854        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19855        assert_eq_m128h(r, e);
19856    }
19857
19858    #[simd_test(enable = "avx512fp16,avx512vl")]
19859    unsafe fn test_mm_mask3_fmadd_round_sch() {
19860        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19861        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19862        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19863        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19864            a, b, c, 0,
19865        );
19866        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19867        assert_eq_m128h(r, e);
19868        let r = _mm_mask3_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19869            a, b, c, 1,
19870        );
19871        let e = _mm_setr_ph(-2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19872        assert_eq_m128h(r, e);
19873    }
19874
19875    #[simd_test(enable = "avx512fp16,avx512vl")]
19876    unsafe fn test_mm_maskz_fmadd_round_sch() {
19877        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19878        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
19879        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
19880        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19881            0, a, b, c,
19882        );
19883        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19884        assert_eq_m128h(r, e);
19885        let r = _mm_maskz_fmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
19886            1, a, b, c,
19887        );
19888        let e = _mm_setr_ph(-2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
19889        assert_eq_m128h(r, e);
19890    }
19891
19892    #[simd_test(enable = "avx512fp16,avx512vl")]
19893    unsafe fn test_mm_fcmadd_pch() {
19894        let a = _mm_set1_pch(0.0, 1.0);
19895        let b = _mm_set1_pch(0.0, 2.0);
19896        let c = _mm_set1_pch(0.0, 3.0);
19897        let r = _mm_fcmadd_pch(a, b, c);
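             // Conjugate complex multiply-add: a * conj(b) + c = (0 + 1i) * (0 - 2i) + (0 + 3i) = 2 + 3i per pair.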
19898        let e = _mm_set1_pch(2.0, 3.0);
19899        assert_eq_m128h(r, e);
19900    }
19901
19902    #[simd_test(enable = "avx512fp16,avx512vl")]
19903    unsafe fn test_mm_mask_fcmadd_pch() {
19904        let a = _mm_set1_pch(0.0, 1.0);
19905        let b = _mm_set1_pch(0.0, 2.0);
19906        let c = _mm_set1_pch(0.0, 3.0);
19907        let r = _mm_mask_fcmadd_pch(a, 0b0101, b, c);
19908        let e = _mm_setr_ph(2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0);
19909        assert_eq_m128h(r, e);
19910    }
19911
19912    #[simd_test(enable = "avx512fp16,avx512vl")]
19913    unsafe fn test_mm_mask3_fcmadd_pch() {
19914        let a = _mm_set1_pch(0.0, 1.0);
19915        let b = _mm_set1_pch(0.0, 2.0);
19916        let c = _mm_set1_pch(0.0, 3.0);
19917        let r = _mm_mask3_fcmadd_pch(a, b, c, 0b0101);
19918        let e = _mm_setr_ph(2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0);
19919        assert_eq_m128h(r, e);
19920    }
19921
19922    #[simd_test(enable = "avx512fp16,avx512vl")]
19923    unsafe fn test_mm_maskz_fcmadd_pch() {
19924        let a = _mm_set1_pch(0.0, 1.0);
19925        let b = _mm_set1_pch(0.0, 2.0);
19926        let c = _mm_set1_pch(0.0, 3.0);
19927        let r = _mm_maskz_fcmadd_pch(0b0101, a, b, c);
19928        let e = _mm_setr_ph(2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0);
19929        assert_eq_m128h(r, e);
19930    }
19931
19932    #[simd_test(enable = "avx512fp16,avx512vl")]
19933    unsafe fn test_mm256_fcmadd_pch() {
19934        let a = _mm256_set1_pch(0.0, 1.0);
19935        let b = _mm256_set1_pch(0.0, 2.0);
19936        let c = _mm256_set1_pch(0.0, 3.0);
19937        let r = _mm256_fcmadd_pch(a, b, c);
19938        let e = _mm256_set1_pch(2.0, 3.0);
19939        assert_eq_m256h(r, e);
19940    }
19941
19942    #[simd_test(enable = "avx512fp16,avx512vl")]
19943    unsafe fn test_mm256_mask_fcmadd_pch() {
19944        let a = _mm256_set1_pch(0.0, 1.0);
19945        let b = _mm256_set1_pch(0.0, 2.0);
19946        let c = _mm256_set1_pch(0.0, 3.0);
19947        let r = _mm256_mask_fcmadd_pch(a, 0b01010101, b, c);
19948        let e = _mm256_setr_ph(
19949            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19950        );
19951        assert_eq_m256h(r, e);
19952    }
19953
19954    #[simd_test(enable = "avx512fp16,avx512vl")]
19955    unsafe fn test_mm256_mask3_fcmadd_pch() {
19956        let a = _mm256_set1_pch(0.0, 1.0);
19957        let b = _mm256_set1_pch(0.0, 2.0);
19958        let c = _mm256_set1_pch(0.0, 3.0);
19959        let r = _mm256_mask3_fcmadd_pch(a, b, c, 0b01010101);
19960        let e = _mm256_setr_ph(
19961            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
19962        );
19963        assert_eq_m256h(r, e);
19964    }
19965
19966    #[simd_test(enable = "avx512fp16,avx512vl")]
19967    unsafe fn test_mm256_maskz_fcmadd_pch() {
19968        let a = _mm256_set1_pch(0.0, 1.0);
19969        let b = _mm256_set1_pch(0.0, 2.0);
19970        let c = _mm256_set1_pch(0.0, 3.0);
19971        let r = _mm256_maskz_fcmadd_pch(0b01010101, a, b, c);
19972        let e = _mm256_setr_ph(
19973            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
19974        );
19975        assert_eq_m256h(r, e);
19976    }
19977
19978    #[simd_test(enable = "avx512fp16")]
19979    unsafe fn test_mm512_fcmadd_pch() {
19980        let a = _mm512_set1_pch(0.0, 1.0);
19981        let b = _mm512_set1_pch(0.0, 2.0);
19982        let c = _mm512_set1_pch(0.0, 3.0);
19983        let r = _mm512_fcmadd_pch(a, b, c);
19984        let e = _mm512_set1_pch(2.0, 3.0);
19985        assert_eq_m512h(r, e);
19986    }
19987
19988    #[simd_test(enable = "avx512fp16")]
19989    unsafe fn test_mm512_mask_fcmadd_pch() {
19990        let a = _mm512_set1_pch(0.0, 1.0);
19991        let b = _mm512_set1_pch(0.0, 2.0);
19992        let c = _mm512_set1_pch(0.0, 3.0);
19993        let r = _mm512_mask_fcmadd_pch(a, 0b0101010101010101, b, c);
19994        let e = _mm512_setr_ph(
19995            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
19996            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
19997        );
19998        assert_eq_m512h(r, e);
19999    }
20000
20001    #[simd_test(enable = "avx512fp16")]
20002    unsafe fn test_mm512_mask3_fcmadd_pch() {
20003        let a = _mm512_set1_pch(0.0, 1.0);
20004        let b = _mm512_set1_pch(0.0, 2.0);
20005        let c = _mm512_set1_pch(0.0, 3.0);
20006        let r = _mm512_mask3_fcmadd_pch(a, b, c, 0b0101010101010101);
20007        let e = _mm512_setr_ph(
20008            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
20009            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
20010        );
20011        assert_eq_m512h(r, e);
20012    }
20013
20014    #[simd_test(enable = "avx512fp16")]
20015    unsafe fn test_mm512_maskz_fcmadd_pch() {
20016        let a = _mm512_set1_pch(0.0, 1.0);
20017        let b = _mm512_set1_pch(0.0, 2.0);
20018        let c = _mm512_set1_pch(0.0, 3.0);
20019        let r = _mm512_maskz_fcmadd_pch(0b0101010101010101, a, b, c);
20020        let e = _mm512_setr_ph(
20021            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
20022            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
20023        );
20024        assert_eq_m512h(r, e);
20025    }
20026
20027    #[simd_test(enable = "avx512fp16")]
20028    unsafe fn test_mm512_fcmadd_round_pch() {
20029        let a = _mm512_set1_pch(0.0, 1.0);
20030        let b = _mm512_set1_pch(0.0, 2.0);
20031        let c = _mm512_set1_pch(0.0, 3.0);
20032        let r =
20033            _mm512_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20034        let e = _mm512_set1_pch(2.0, 3.0);
20035        assert_eq_m512h(r, e);
20036    }
20037
20038    #[simd_test(enable = "avx512fp16")]
20039    unsafe fn test_mm512_mask_fcmadd_round_pch() {
20040        let a = _mm512_set1_pch(0.0, 1.0);
20041        let b = _mm512_set1_pch(0.0, 2.0);
20042        let c = _mm512_set1_pch(0.0, 3.0);
20043        let r = _mm512_mask_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20044            a,
20045            0b0101010101010101,
20046            b,
20047            c,
20048        );
20049        let e = _mm512_setr_ph(
20050            2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0,
20051            3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0,
20052        );
20053        assert_eq_m512h(r, e);
20054    }
20055
20056    #[simd_test(enable = "avx512fp16")]
20057    unsafe fn test_mm512_mask3_fcmadd_round_pch() {
20058        let a = _mm512_set1_pch(0.0, 1.0);
20059        let b = _mm512_set1_pch(0.0, 2.0);
20060        let c = _mm512_set1_pch(0.0, 3.0);
20061        let r = _mm512_mask3_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20062            a,
20063            b,
20064            c,
20065            0b0101010101010101,
20066        );
20067        let e = _mm512_setr_ph(
20068            2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0,
20069            3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0, 2.0, 3.0, 0.0, 3.0,
20070        );
20071        assert_eq_m512h(r, e);
20072    }
20073
20074    #[simd_test(enable = "avx512fp16")]
20075    unsafe fn test_mm512_maskz_fcmadd_round_pch() {
20076        let a = _mm512_set1_pch(0.0, 1.0);
20077        let b = _mm512_set1_pch(0.0, 2.0);
20078        let c = _mm512_set1_pch(0.0, 3.0);
20079        let r = _mm512_maskz_fcmadd_round_pch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20080            0b0101010101010101,
20081            a,
20082            b,
20083            c,
20084        );
20085        let e = _mm512_setr_ph(
20086            2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0,
20087            3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0, 2.0, 3.0, 0.0, 0.0,
20088        );
20089        assert_eq_m512h(r, e);
20090    }
20091
20092    #[simd_test(enable = "avx512fp16,avx512vl")]
20093    unsafe fn test_mm_fcmadd_sch() {
20094        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20095        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20096        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20097        let r = _mm_fcmadd_sch(a, b, c);
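             // Scalar form: only the lowest pair is computed (a * conj(b) + c = 2 + 3i); the upper lanes are copied from a.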
20098        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20099        assert_eq_m128h(r, e);
20100    }
20101
20102    #[simd_test(enable = "avx512fp16,avx512vl")]
20103    unsafe fn test_mm_mask_fcmadd_sch() {
20104        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20105        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20106        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20107        let r = _mm_mask_fcmadd_sch(a, 0, b, c);
20108        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20109        assert_eq_m128h(r, e);
20110        let r = _mm_mask_fcmadd_sch(a, 1, b, c);
20111        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20112        assert_eq_m128h(r, e);
20113    }
20114
20115    #[simd_test(enable = "avx512fp16,avx512vl")]
20116    unsafe fn test_mm_mask3_fcmadd_sch() {
20117        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20118        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20119        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20120        let r = _mm_mask3_fcmadd_sch(a, b, c, 0);
20121        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20122        assert_eq_m128h(r, e);
20123        let r = _mm_mask3_fcmadd_sch(a, b, c, 1);
20124        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20125        assert_eq_m128h(r, e);
20126    }
20127
20128    #[simd_test(enable = "avx512fp16,avx512vl")]
20129    unsafe fn test_mm_maskz_fcmadd_sch() {
20130        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20131        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20132        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20133        let r = _mm_maskz_fcmadd_sch(0, a, b, c);
20134        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20135        assert_eq_m128h(r, e);
20136        let r = _mm_maskz_fcmadd_sch(1, a, b, c);
20137        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20138        assert_eq_m128h(r, e);
20139    }
20140
20141    #[simd_test(enable = "avx512fp16,avx512vl")]
20142    unsafe fn test_mm_fcmadd_round_sch() {
20143        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20144        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20145        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20146        let r = _mm_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20147        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20148        assert_eq_m128h(r, e);
20149    }
20150
20151    #[simd_test(enable = "avx512fp16,avx512vl")]
20152    unsafe fn test_mm_mask_fcmadd_round_sch() {
20153        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20154        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20155        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20156        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20157            a, 0, b, c,
20158        );
20159        let e = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20160        assert_eq_m128h(r, e);
20161        let r = _mm_mask_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20162            a, 1, b, c,
20163        );
20164        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20165        assert_eq_m128h(r, e);
20166    }
20167
20168    #[simd_test(enable = "avx512fp16,avx512vl")]
20169    unsafe fn test_mm_mask3_fcmadd_round_sch() {
20170        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20171        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20172        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20173        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20174            a, b, c, 0,
20175        );
20176        let e = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20177        assert_eq_m128h(r, e);
20178        let r = _mm_mask3_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20179            a, b, c, 1,
20180        );
20181        let e = _mm_setr_ph(2.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20182        assert_eq_m128h(r, e);
20183    }
20184
20185    #[simd_test(enable = "avx512fp16,avx512vl")]
20186    unsafe fn test_mm_maskz_fcmadd_round_sch() {
20187        let a = _mm_setr_ph(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20188        let b = _mm_setr_ph(0.0, 2.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0);
20189        let c = _mm_setr_ph(0.0, 3.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0);
20190        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20191            0, a, b, c,
20192        );
20193        let e = _mm_setr_ph(0.0, 0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20194        assert_eq_m128h(r, e);
20195        let r = _mm_maskz_fcmadd_round_sch::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20196            1, a, b, c,
20197        );
20198        let e = _mm_setr_ph(2.0, 3.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
20199        assert_eq_m128h(r, e);
20200    }
20201
20202    #[simd_test(enable = "avx512fp16,avx512vl")]
20203    unsafe fn test_mm_fmadd_ph() {
20204        let a = _mm_set1_ph(1.0);
20205        let b = _mm_set1_ph(2.0);
20206        let c = _mm_set1_ph(3.0);
20207        let r = _mm_fmadd_ph(a, b, c);
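             // Real (per-lane) fused multiply-add: 1.0 * 2.0 + 3.0 = 5.0 in every f16 lane.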
20208        let e = _mm_set1_ph(5.0);
20209        assert_eq_m128h(r, e);
20210    }
20211
20212    #[simd_test(enable = "avx512fp16,avx512vl")]
20213    unsafe fn test_mm_mask_fmadd_ph() {
20214        let a = _mm_set1_ph(1.0);
20215        let b = _mm_set1_ph(2.0);
20216        let c = _mm_set1_ph(3.0);
20217        let r = _mm_mask_fmadd_ph(a, 0b01010101, b, c);
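             // _mm_set_ph lists elements from highest to lowest, so the even lanes (set mask bits) hold 5.0 and the odd lanes keep 1.0 from a.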
20218        let e = _mm_set_ph(1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0);
20219        assert_eq_m128h(r, e);
20220    }
20221
20222    #[simd_test(enable = "avx512fp16,avx512vl")]
20223    unsafe fn test_mm_mask3_fmadd_ph() {
20224        let a = _mm_set1_ph(1.0);
20225        let b = _mm_set1_ph(2.0);
20226        let c = _mm_set1_ph(3.0);
20227        let r = _mm_mask3_fmadd_ph(a, b, c, 0b01010101);
20228        let e = _mm_set_ph(3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0);
20229        assert_eq_m128h(r, e);
20230    }
20231
20232    #[simd_test(enable = "avx512fp16,avx512vl")]
20233    unsafe fn test_mm_maskz_fmadd_ph() {
20234        let a = _mm_set1_ph(1.0);
20235        let b = _mm_set1_ph(2.0);
20236        let c = _mm_set1_ph(3.0);
20237        let r = _mm_maskz_fmadd_ph(0b01010101, a, b, c);
20238        let e = _mm_set_ph(0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0);
20239        assert_eq_m128h(r, e);
20240    }
20241
20242    #[simd_test(enable = "avx512fp16,avx512vl")]
20243    unsafe fn test_mm256_fmadd_ph() {
20244        let a = _mm256_set1_ph(1.0);
20245        let b = _mm256_set1_ph(2.0);
20246        let c = _mm256_set1_ph(3.0);
20247        let r = _mm256_fmadd_ph(a, b, c);
20248        let e = _mm256_set1_ph(5.0);
20249        assert_eq_m256h(r, e);
20250    }
20251
20252    #[simd_test(enable = "avx512fp16,avx512vl")]
20253    unsafe fn test_mm256_mask_fmadd_ph() {
20254        let a = _mm256_set1_ph(1.0);
20255        let b = _mm256_set1_ph(2.0);
20256        let c = _mm256_set1_ph(3.0);
20257        let r = _mm256_mask_fmadd_ph(a, 0b0101010101010101, b, c);
20258        let e = _mm256_set_ph(
20259            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20260        );
20261        assert_eq_m256h(r, e);
20262    }
20263
20264    #[simd_test(enable = "avx512fp16,avx512vl")]
20265    unsafe fn test_mm256_mask3_fmadd_ph() {
20266        let a = _mm256_set1_ph(1.0);
20267        let b = _mm256_set1_ph(2.0);
20268        let c = _mm256_set1_ph(3.0);
20269        let r = _mm256_mask3_fmadd_ph(a, b, c, 0b0101010101010101);
20270        let e = _mm256_set_ph(
20271            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20272        );
20273        assert_eq_m256h(r, e);
20274    }
20275
20276    #[simd_test(enable = "avx512fp16,avx512vl")]
20277    unsafe fn test_mm256_maskz_fmadd_ph() {
20278        let a = _mm256_set1_ph(1.0);
20279        let b = _mm256_set1_ph(2.0);
20280        let c = _mm256_set1_ph(3.0);
20281        let r = _mm256_maskz_fmadd_ph(0b0101010101010101, a, b, c);
20282        let e = _mm256_set_ph(
20283            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20284        );
20285        assert_eq_m256h(r, e);
20286    }
20287
20288    #[simd_test(enable = "avx512fp16")]
20289    unsafe fn test_mm512_fmadd_ph() {
20290        let a = _mm512_set1_ph(1.0);
20291        let b = _mm512_set1_ph(2.0);
20292        let c = _mm512_set1_ph(3.0);
20293        let r = _mm512_fmadd_ph(a, b, c);
20294        let e = _mm512_set1_ph(5.0);
20295        assert_eq_m512h(r, e);
20296    }
20297
20298    #[simd_test(enable = "avx512fp16")]
20299    unsafe fn test_mm512_mask_fmadd_ph() {
20300        let a = _mm512_set1_ph(1.0);
20301        let b = _mm512_set1_ph(2.0);
20302        let c = _mm512_set1_ph(3.0);
20303        let r = _mm512_mask_fmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20304        let e = _mm512_set_ph(
20305            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20306            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20307        );
20308        assert_eq_m512h(r, e);
20309    }
20310
20311    #[simd_test(enable = "avx512fp16")]
20312    unsafe fn test_mm512_mask3_fmadd_ph() {
20313        let a = _mm512_set1_ph(1.0);
20314        let b = _mm512_set1_ph(2.0);
20315        let c = _mm512_set1_ph(3.0);
20316        let r = _mm512_mask3_fmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20317        let e = _mm512_set_ph(
20318            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20319            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20320        );
20321        assert_eq_m512h(r, e);
20322    }
20323
20324    #[simd_test(enable = "avx512fp16")]
20325    unsafe fn test_mm512_maskz_fmadd_ph() {
20326        let a = _mm512_set1_ph(1.0);
20327        let b = _mm512_set1_ph(2.0);
20328        let c = _mm512_set1_ph(3.0);
20329        let r = _mm512_maskz_fmadd_ph(0b01010101010101010101010101010101, a, b, c);
20330        let e = _mm512_set_ph(
20331            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20332            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20333        );
20334        assert_eq_m512h(r, e);
20335    }
20336
20337    #[simd_test(enable = "avx512fp16")]
20338    unsafe fn test_mm512_fmadd_round_ph() {
20339        let a = _mm512_set1_ph(1.0);
20340        let b = _mm512_set1_ph(2.0);
20341        let c = _mm512_set1_ph(3.0);
20342        let r = _mm512_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20343        let e = _mm512_set1_ph(5.0);
20344        assert_eq_m512h(r, e);
20345    }
20346
20347    #[simd_test(enable = "avx512fp16")]
20348    unsafe fn test_mm512_mask_fmadd_round_ph() {
20349        let a = _mm512_set1_ph(1.0);
20350        let b = _mm512_set1_ph(2.0);
20351        let c = _mm512_set1_ph(3.0);
20352        let r = _mm512_mask_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20353            a,
20354            0b01010101010101010101010101010101,
20355            b,
20356            c,
20357        );
20358        let e = _mm512_set_ph(
20359            1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0,
20360            5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0, 1.0, 5.0,
20361        );
20362        assert_eq_m512h(r, e);
20363    }
20364
20365    #[simd_test(enable = "avx512fp16")]
20366    unsafe fn test_mm512_mask3_fmadd_round_ph() {
20367        let a = _mm512_set1_ph(1.0);
20368        let b = _mm512_set1_ph(2.0);
20369        let c = _mm512_set1_ph(3.0);
20370        let r = _mm512_mask3_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20371            a,
20372            b,
20373            c,
20374            0b01010101010101010101010101010101,
20375        );
20376        let e = _mm512_set_ph(
20377            3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0,
20378            5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0, 3.0, 5.0,
20379        );
20380        assert_eq_m512h(r, e);
20381    }
20382
20383    #[simd_test(enable = "avx512fp16")]
20384    unsafe fn test_mm512_maskz_fmadd_round_ph() {
20385        let a = _mm512_set1_ph(1.0);
20386        let b = _mm512_set1_ph(2.0);
20387        let c = _mm512_set1_ph(3.0);
20388        let r = _mm512_maskz_fmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20389            0b01010101010101010101010101010101,
20390            a,
20391            b,
20392            c,
20393        );
20394        let e = _mm512_set_ph(
20395            0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0,
20396            5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0, 0.0, 5.0,
20397        );
20398        assert_eq_m512h(r, e);
20399    }
20400
20401    #[simd_test(enable = "avx512fp16,avx512vl")]
20402    unsafe fn test_mm_fmadd_sh() {
20403        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20404        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20405        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20406        let r = _mm_fmadd_sh(a, b, c);
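             // Scalar f16 FMA: dst[0] = 1.0 * 2.0 + 3.0 = 5.0; the remaining lanes are copied from a.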
20407        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20408        assert_eq_m128h(r, e);
20409    }
20410
20411    #[simd_test(enable = "avx512fp16,avx512vl")]
20412    unsafe fn test_mm_mask_fmadd_sh() {
20413        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20414        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20415        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20416        let r = _mm_mask_fmadd_sh(a, 0, b, c);
20417        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20418        assert_eq_m128h(r, e);
20419        let r = _mm_mask_fmadd_sh(a, 1, b, c);
20420        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20421        assert_eq_m128h(r, e);
20422    }
20423
20424    #[simd_test(enable = "avx512fp16,avx512vl")]
20425    unsafe fn test_mm_mask3_fmadd_sh() {
20426        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20427        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20428        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20429        let r = _mm_mask3_fmadd_sh(a, b, c, 0);
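             // mask3 form: the upper lanes always come from c; lane 0 is the FMA result when the mask bit is set, otherwise c[0].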
20430        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20431        assert_eq_m128h(r, e);
20432        let r = _mm_mask3_fmadd_sh(a, b, c, 1);
20433        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20434        assert_eq_m128h(r, e);
20435    }
20436
20437    #[simd_test(enable = "avx512fp16,avx512vl")]
20438    unsafe fn test_mm_maskz_fmadd_sh() {
20439        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20440        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20441        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20442        let r = _mm_maskz_fmadd_sh(0, a, b, c);
20443        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20444        assert_eq_m128h(r, e);
20445        let r = _mm_maskz_fmadd_sh(1, a, b, c);
20446        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20447        assert_eq_m128h(r, e);
20448    }
20449
20450    #[simd_test(enable = "avx512fp16,avx512vl")]
20451    unsafe fn test_mm_fmadd_round_sh() {
20452        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20453        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20454        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20455        let r = _mm_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20456        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20457        assert_eq_m128h(r, e);
20458    }
20459
20460    #[simd_test(enable = "avx512fp16,avx512vl")]
20461    unsafe fn test_mm_mask_fmadd_round_sh() {
20462        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20463        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20464        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20465        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20466            a, 0, b, c,
20467        );
20468        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20469        assert_eq_m128h(r, e);
20470        let r = _mm_mask_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20471            a, 1, b, c,
20472        );
20473        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20474        assert_eq_m128h(r, e);
20475    }
20476
20477    #[simd_test(enable = "avx512fp16,avx512vl")]
20478    unsafe fn test_mm_mask3_fmadd_round_sh() {
20479        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20480        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20481        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20482        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20483            a, b, c, 0,
20484        );
20485        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20486        assert_eq_m128h(r, e);
20487        let r = _mm_mask3_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20488            a, b, c, 1,
20489        );
20490        let e = _mm_setr_ph(5.0, 30., 31., 32., 33., 34., 35., 36.);
20491        assert_eq_m128h(r, e);
20492    }
20493
20494    #[simd_test(enable = "avx512fp16,avx512vl")]
20495    unsafe fn test_mm_maskz_fmadd_round_sh() {
20496        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20497        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20498        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20499        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20500            0, a, b, c,
20501        );
20502        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20503        assert_eq_m128h(r, e);
20504        let r = _mm_maskz_fmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20505            1, a, b, c,
20506        );
20507        let e = _mm_setr_ph(5.0, 10., 11., 12., 13., 14., 15., 16.);
20508        assert_eq_m128h(r, e);
20509    }
20510
20511    #[simd_test(enable = "avx512fp16,avx512vl")]
20512    unsafe fn test_mm_fmsub_ph() {
20513        let a = _mm_set1_ph(1.0);
20514        let b = _mm_set1_ph(2.0);
20515        let c = _mm_set1_ph(3.0);
20516        let r = _mm_fmsub_ph(a, b, c);
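             // Fused multiply-subtract: 1.0 * 2.0 - 3.0 = -1.0 in every lane.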
20517        let e = _mm_set1_ph(-1.0);
20518        assert_eq_m128h(r, e);
20519    }
20520
20521    #[simd_test(enable = "avx512fp16,avx512vl")]
20522    unsafe fn test_mm_mask_fmsub_ph() {
20523        let a = _mm_set1_ph(1.0);
20524        let b = _mm_set1_ph(2.0);
20525        let c = _mm_set1_ph(3.0);
20526        let r = _mm_mask_fmsub_ph(a, 0b01010101, b, c);
20527        let e = _mm_set_ph(1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0);
20528        assert_eq_m128h(r, e);
20529    }
20530
20531    #[simd_test(enable = "avx512fp16,avx512vl")]
20532    unsafe fn test_mm_mask3_fmsub_ph() {
20533        let a = _mm_set1_ph(1.0);
20534        let b = _mm_set1_ph(2.0);
20535        let c = _mm_set1_ph(3.0);
20536        let r = _mm_mask3_fmsub_ph(a, b, c, 0b01010101);
20537        let e = _mm_set_ph(3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0);
20538        assert_eq_m128h(r, e);
20539    }
20540
20541    #[simd_test(enable = "avx512fp16,avx512vl")]
20542    unsafe fn test_mm_maskz_fmsub_ph() {
20543        let a = _mm_set1_ph(1.0);
20544        let b = _mm_set1_ph(2.0);
20545        let c = _mm_set1_ph(3.0);
20546        let r = _mm_maskz_fmsub_ph(0b01010101, a, b, c);
20547        let e = _mm_set_ph(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0);
20548        assert_eq_m128h(r, e);
20549    }
20550
20551    #[simd_test(enable = "avx512fp16,avx512vl")]
20552    unsafe fn test_mm256_fmsub_ph() {
20553        let a = _mm256_set1_ph(1.0);
20554        let b = _mm256_set1_ph(2.0);
20555        let c = _mm256_set1_ph(3.0);
20556        let r = _mm256_fmsub_ph(a, b, c);
20557        let e = _mm256_set1_ph(-1.0);
20558        assert_eq_m256h(r, e);
20559    }
20560
20561    #[simd_test(enable = "avx512fp16,avx512vl")]
20562    unsafe fn test_mm256_mask_fmsub_ph() {
20563        let a = _mm256_set1_ph(1.0);
20564        let b = _mm256_set1_ph(2.0);
20565        let c = _mm256_set1_ph(3.0);
20566        let r = _mm256_mask_fmsub_ph(a, 0b0101010101010101, b, c);
20567        let e = _mm256_set_ph(
20568            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20569        );
20570        assert_eq_m256h(r, e);
20571    }
20572
20573    #[simd_test(enable = "avx512fp16,avx512vl")]
20574    unsafe fn test_mm256_mask3_fmsub_ph() {
20575        let a = _mm256_set1_ph(1.0);
20576        let b = _mm256_set1_ph(2.0);
20577        let c = _mm256_set1_ph(3.0);
20578        let r = _mm256_mask3_fmsub_ph(a, b, c, 0b0101010101010101);
20579        let e = _mm256_set_ph(
20580            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20581        );
20582        assert_eq_m256h(r, e);
20583    }
20584
20585    #[simd_test(enable = "avx512fp16,avx512vl")]
20586    unsafe fn test_mm256_maskz_fmsub_ph() {
20587        let a = _mm256_set1_ph(1.0);
20588        let b = _mm256_set1_ph(2.0);
20589        let c = _mm256_set1_ph(3.0);
20590        let r = _mm256_maskz_fmsub_ph(0b0101010101010101, a, b, c);
20591        let e = _mm256_set_ph(
20592            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20593        );
20594        assert_eq_m256h(r, e);
20595    }
20596
20597    #[simd_test(enable = "avx512fp16")]
20598    unsafe fn test_mm512_fmsub_ph() {
20599        let a = _mm512_set1_ph(1.0);
20600        let b = _mm512_set1_ph(2.0);
20601        let c = _mm512_set1_ph(3.0);
20602        let r = _mm512_fmsub_ph(a, b, c);
20603        let e = _mm512_set1_ph(-1.0);
20604        assert_eq_m512h(r, e);
20605    }
20606
20607    #[simd_test(enable = "avx512fp16")]
20608    unsafe fn test_mm512_mask_fmsub_ph() {
20609        let a = _mm512_set1_ph(1.0);
20610        let b = _mm512_set1_ph(2.0);
20611        let c = _mm512_set1_ph(3.0);
20612        let r = _mm512_mask_fmsub_ph(a, 0b01010101010101010101010101010101, b, c);
20613        let e = _mm512_set_ph(
20614            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20615            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20616        );
20617        assert_eq_m512h(r, e);
20618    }
20619
20620    #[simd_test(enable = "avx512fp16")]
20621    unsafe fn test_mm512_mask3_fmsub_ph() {
20622        let a = _mm512_set1_ph(1.0);
20623        let b = _mm512_set1_ph(2.0);
20624        let c = _mm512_set1_ph(3.0);
20625        let r = _mm512_mask3_fmsub_ph(a, b, c, 0b01010101010101010101010101010101);
20626        let e = _mm512_set_ph(
20627            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20628            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20629        );
20630        assert_eq_m512h(r, e);
20631    }
20632
20633    #[simd_test(enable = "avx512fp16")]
20634    unsafe fn test_mm512_maskz_fmsub_ph() {
20635        let a = _mm512_set1_ph(1.0);
20636        let b = _mm512_set1_ph(2.0);
20637        let c = _mm512_set1_ph(3.0);
20638        let r = _mm512_maskz_fmsub_ph(0b01010101010101010101010101010101, a, b, c);
20639        let e = _mm512_set_ph(
20640            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20641            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20642        );
20643        assert_eq_m512h(r, e);
20644    }
20645
20646    #[simd_test(enable = "avx512fp16")]
20647    unsafe fn test_mm512_fmsub_round_ph() {
20648        let a = _mm512_set1_ph(1.0);
20649        let b = _mm512_set1_ph(2.0);
20650        let c = _mm512_set1_ph(3.0);
20651        let r = _mm512_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20652        let e = _mm512_set1_ph(-1.0);
20653        assert_eq_m512h(r, e);
20654    }
20655
20656    #[simd_test(enable = "avx512fp16")]
20657    unsafe fn test_mm512_mask_fmsub_round_ph() {
20658        let a = _mm512_set1_ph(1.0);
20659        let b = _mm512_set1_ph(2.0);
20660        let c = _mm512_set1_ph(3.0);
20661        let r = _mm512_mask_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20662            a,
20663            0b01010101010101010101010101010101,
20664            b,
20665            c,
20666        );
20667        let e = _mm512_set_ph(
20668            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20669            1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0,
20670        );
20671        assert_eq_m512h(r, e);
20672    }
20673
20674    #[simd_test(enable = "avx512fp16")]
20675    unsafe fn test_mm512_mask3_fmsub_round_ph() {
20676        let a = _mm512_set1_ph(1.0);
20677        let b = _mm512_set1_ph(2.0);
20678        let c = _mm512_set1_ph(3.0);
20679        let r = _mm512_mask3_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20680            a,
20681            b,
20682            c,
20683            0b01010101010101010101010101010101,
20684        );
20685        let e = _mm512_set_ph(
20686            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20687            3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0, 3.0, -1.0,
20688        );
20689        assert_eq_m512h(r, e);
20690    }
20691
20692    #[simd_test(enable = "avx512fp16")]
20693    unsafe fn test_mm512_maskz_fmsub_round_ph() {
20694        let a = _mm512_set1_ph(1.0);
20695        let b = _mm512_set1_ph(2.0);
20696        let c = _mm512_set1_ph(3.0);
20697        let r = _mm512_maskz_fmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20698            0b01010101010101010101010101010101,
20699            a,
20700            b,
20701            c,
20702        );
20703        let e = _mm512_set_ph(
20704            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20705            0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0,
20706        );
20707        assert_eq_m512h(r, e);
20708    }
20709
20710    #[simd_test(enable = "avx512fp16,avx512vl")]
20711    unsafe fn test_mm_fmsub_sh() {
20712        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20713        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20714        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20715        let r = _mm_fmsub_sh(a, b, c);
20716        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20717        assert_eq_m128h(r, e);
20718    }
20719
20720    #[simd_test(enable = "avx512fp16,avx512vl")]
20721    unsafe fn test_mm_mask_fmsub_sh() {
20722        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20723        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20724        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20725        let r = _mm_mask_fmsub_sh(a, 0, b, c);
20726        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20727        assert_eq_m128h(r, e);
20728        let r = _mm_mask_fmsub_sh(a, 1, b, c);
20729        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20730        assert_eq_m128h(r, e);
20731    }
20732
20733    #[simd_test(enable = "avx512fp16,avx512vl")]
20734    unsafe fn test_mm_mask3_fmsub_sh() {
20735        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20736        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20737        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20738        let r = _mm_mask3_fmsub_sh(a, b, c, 0);
20739        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20740        assert_eq_m128h(r, e);
20741        let r = _mm_mask3_fmsub_sh(a, b, c, 1);
20742        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20743        assert_eq_m128h(r, e);
20744    }
20745
20746    #[simd_test(enable = "avx512fp16,avx512vl")]
20747    unsafe fn test_mm_maskz_fmsub_sh() {
20748        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20749        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20750        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20751        let r = _mm_maskz_fmsub_sh(0, a, b, c);
20752        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20753        assert_eq_m128h(r, e);
20754        let r = _mm_maskz_fmsub_sh(1, a, b, c);
20755        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20756        assert_eq_m128h(r, e);
20757    }
20758
20759    #[simd_test(enable = "avx512fp16,avx512vl")]
20760    unsafe fn test_mm_fmsub_round_sh() {
20761        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20762        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20763        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20764        let r = _mm_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20765        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20766        assert_eq_m128h(r, e);
20767    }
20768
20769    #[simd_test(enable = "avx512fp16")]
20770    unsafe fn test_mm_mask_fmsub_round_sh() {
20771        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20772        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20773        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20774        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20775            a, 0, b, c,
20776        );
20777        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20778        assert_eq_m128h(r, e);
20779        let r = _mm_mask_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20780            a, 1, b, c,
20781        );
20782        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20783        assert_eq_m128h(r, e);
20784    }
20785
20786    #[simd_test(enable = "avx512fp16")]
20787    unsafe fn test_mm_mask3_fmsub_round_sh() {
20788        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20789        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20790        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20791        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20792            a, b, c, 0,
20793        );
20794        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20795        assert_eq_m128h(r, e);
20796        let r = _mm_mask3_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20797            a, b, c, 1,
20798        );
20799        let e = _mm_setr_ph(-1.0, 30., 31., 32., 33., 34., 35., 36.);
20800        assert_eq_m128h(r, e);
20801    }
20802
20803    #[simd_test(enable = "avx512fp16")]
20804    unsafe fn test_mm_maskz_fmsub_round_sh() {
20805        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
20806        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
20807        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
20808        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20809            0, a, b, c,
20810        );
20811        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
20812        assert_eq_m128h(r, e);
20813        let r = _mm_maskz_fmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20814            1, a, b, c,
20815        );
20816        let e = _mm_setr_ph(-1.0, 10., 11., 12., 13., 14., 15., 16.);
20817        assert_eq_m128h(r, e);
20818    }
20819
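    // fnmadd computes `-(a * b) + c` per lane, so with a = 1.0, b = 2.0, c = 3.0 every
    // computed lane is 1.0. The masked flavours follow the usual AVX-512 convention:
    // `_mask_` keeps lanes from `a`, `_mask3_` keeps lanes from `c`, and `_maskz_`
    // zeroes lanes whose mask bit is clear. Mask bit i controls lane i, while
    // `_mm_set_ph` lists arguments from the highest lane down, so the expected
    // patterns read "reversed" relative to the mask literal.
    //
    // A scalar reference model (a sketch only, not part of the test suite):
    //
    //     fn fnmadd_ref(a: f16, b: f16, c: f16) -> f16 {
    //         -(a * b) + c
    //     }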
20820    #[simd_test(enable = "avx512fp16,avx512vl")]
20821    unsafe fn test_mm_fnmadd_ph() {
20822        let a = _mm_set1_ph(1.0);
20823        let b = _mm_set1_ph(2.0);
20824        let c = _mm_set1_ph(3.0);
20825        let r = _mm_fnmadd_ph(a, b, c);
20826        let e = _mm_set1_ph(1.0);
20827        assert_eq_m128h(r, e);
20828    }
20829
20830    #[simd_test(enable = "avx512fp16,avx512vl")]
20831    unsafe fn test_mm_mask_fnmadd_ph() {
20832        let a = _mm_set1_ph(1.0);
20833        let b = _mm_set1_ph(2.0);
20834        let c = _mm_set1_ph(3.0);
20835        let r = _mm_mask_fnmadd_ph(a, 0b01010101, b, c);
20836        let e = _mm_set_ph(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0);
20837        assert_eq_m128h(r, e);
20838    }
20839
20840    #[simd_test(enable = "avx512fp16,avx512vl")]
20841    unsafe fn test_mm_mask3_fnmadd_ph() {
20842        let a = _mm_set1_ph(1.0);
20843        let b = _mm_set1_ph(2.0);
20844        let c = _mm_set1_ph(3.0);
20845        let r = _mm_mask3_fnmadd_ph(a, b, c, 0b01010101);
20846        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
20847        assert_eq_m128h(r, e);
20848    }
20849
20850    #[simd_test(enable = "avx512fp16,avx512vl")]
20851    unsafe fn test_mm_maskz_fnmadd_ph() {
20852        let a = _mm_set1_ph(1.0);
20853        let b = _mm_set1_ph(2.0);
20854        let c = _mm_set1_ph(3.0);
20855        let r = _mm_maskz_fnmadd_ph(0b01010101, a, b, c);
20856        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
20857        assert_eq_m128h(r, e);
20858    }
20859
20860    #[simd_test(enable = "avx512fp16,avx512vl")]
20861    unsafe fn test_mm256_fnmadd_ph() {
20862        let a = _mm256_set1_ph(1.0);
20863        let b = _mm256_set1_ph(2.0);
20864        let c = _mm256_set1_ph(3.0);
20865        let r = _mm256_fnmadd_ph(a, b, c);
20866        let e = _mm256_set1_ph(1.0);
20867        assert_eq_m256h(r, e);
20868    }
20869
20870    #[simd_test(enable = "avx512fp16,avx512vl")]
20871    unsafe fn test_mm256_mask_fnmadd_ph() {
20872        let a = _mm256_set1_ph(1.0);
20873        let b = _mm256_set1_ph(2.0);
20874        let c = _mm256_set1_ph(3.0);
20875        let r = _mm256_mask_fnmadd_ph(a, 0b0101010101010101, b, c);
20876        let e = _mm256_set_ph(
20877            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20878        );
20879        assert_eq_m256h(r, e);
20880    }
20881
20882    #[simd_test(enable = "avx512fp16,avx512vl")]
20883    unsafe fn test_mm256_mask3_fnmadd_ph() {
20884        let a = _mm256_set1_ph(1.0);
20885        let b = _mm256_set1_ph(2.0);
20886        let c = _mm256_set1_ph(3.0);
20887        let r = _mm256_mask3_fnmadd_ph(a, b, c, 0b0101010101010101);
20888        let e = _mm256_set_ph(
20889            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20890        );
20891        assert_eq_m256h(r, e);
20892    }
20893
20894    #[simd_test(enable = "avx512fp16,avx512vl")]
20895    unsafe fn test_mm256_maskz_fnmadd_ph() {
20896        let a = _mm256_set1_ph(1.0);
20897        let b = _mm256_set1_ph(2.0);
20898        let c = _mm256_set1_ph(3.0);
20899        let r = _mm256_maskz_fnmadd_ph(0b0101010101010101, a, b, c);
20900        let e = _mm256_set_ph(
20901            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20902        );
20903        assert_eq_m256h(r, e);
20904    }
20905
20906    #[simd_test(enable = "avx512fp16")]
20907    unsafe fn test_mm512_fnmadd_ph() {
20908        let a = _mm512_set1_ph(1.0);
20909        let b = _mm512_set1_ph(2.0);
20910        let c = _mm512_set1_ph(3.0);
20911        let r = _mm512_fnmadd_ph(a, b, c);
20912        let e = _mm512_set1_ph(1.0);
20913        assert_eq_m512h(r, e);
20914    }
20915
20916    #[simd_test(enable = "avx512fp16")]
20917    unsafe fn test_mm512_mask_fnmadd_ph() {
20918        let a = _mm512_set1_ph(1.0);
20919        let b = _mm512_set1_ph(2.0);
20920        let c = _mm512_set1_ph(3.0);
20921        let r = _mm512_mask_fnmadd_ph(a, 0b01010101010101010101010101010101, b, c);
20922        let e = _mm512_set_ph(
20923            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20924            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20925        );
20926        assert_eq_m512h(r, e);
20927    }
20928
20929    #[simd_test(enable = "avx512fp16")]
20930    unsafe fn test_mm512_mask3_fnmadd_ph() {
20931        let a = _mm512_set1_ph(1.0);
20932        let b = _mm512_set1_ph(2.0);
20933        let c = _mm512_set1_ph(3.0);
20934        let r = _mm512_mask3_fnmadd_ph(a, b, c, 0b01010101010101010101010101010101);
20935        let e = _mm512_set_ph(
20936            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20937            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20938        );
20939        assert_eq_m512h(r, e);
20940    }
20941
20942    #[simd_test(enable = "avx512fp16")]
20943    unsafe fn test_mm512_maskz_fnmadd_ph() {
20944        let a = _mm512_set1_ph(1.0);
20945        let b = _mm512_set1_ph(2.0);
20946        let c = _mm512_set1_ph(3.0);
20947        let r = _mm512_maskz_fnmadd_ph(0b01010101010101010101010101010101, a, b, c);
20948        let e = _mm512_set_ph(
20949            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
20950            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
20951        );
20952        assert_eq_m512h(r, e);
20953    }
20954
20955    #[simd_test(enable = "avx512fp16")]
20956    unsafe fn test_mm512_fnmadd_round_ph() {
20957        let a = _mm512_set1_ph(1.0);
20958        let b = _mm512_set1_ph(2.0);
20959        let c = _mm512_set1_ph(3.0);
20960        let r =
20961            _mm512_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
20962        let e = _mm512_set1_ph(1.0);
20963        assert_eq_m512h(r, e);
20964    }
20965
20966    #[simd_test(enable = "avx512fp16")]
20967    unsafe fn test_mm512_mask_fnmadd_round_ph() {
20968        let a = _mm512_set1_ph(1.0);
20969        let b = _mm512_set1_ph(2.0);
20970        let c = _mm512_set1_ph(3.0);
20971        let r = _mm512_mask_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20972            a,
20973            0b01010101010101010101010101010101,
20974            b,
20975            c,
20976        );
20977        let e = _mm512_set_ph(
20978            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20979            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
20980        );
20981        assert_eq_m512h(r, e);
20982    }
20983
20984    #[simd_test(enable = "avx512fp16")]
20985    unsafe fn test_mm512_mask3_fnmadd_round_ph() {
20986        let a = _mm512_set1_ph(1.0);
20987        let b = _mm512_set1_ph(2.0);
20988        let c = _mm512_set1_ph(3.0);
20989        let r = _mm512_mask3_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
20990            a,
20991            b,
20992            c,
20993            0b01010101010101010101010101010101,
20994        );
20995        let e = _mm512_set_ph(
20996            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
20997            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
20998        );
20999        assert_eq_m512h(r, e);
21000    }
21001
21002    #[simd_test(enable = "avx512fp16")]
21003    unsafe fn test_mm512_maskz_fnmadd_round_ph() {
21004        let a = _mm512_set1_ph(1.0);
21005        let b = _mm512_set1_ph(2.0);
21006        let c = _mm512_set1_ph(3.0);
21007        let r = _mm512_maskz_fnmadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21008            0b01010101010101010101010101010101,
21009            a,
21010            b,
21011            c,
21012        );
21013        let e = _mm512_set_ph(
21014            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
21015            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
21016        );
21017        assert_eq_m512h(r, e);
21018    }
21019
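    // The scalar (`_sh`) fused-multiply forms only compute lane 0; lanes 1..=7 of the
    // result are copied from `a` (or from `c` for the `_mask3_` variant), which is why
    // the upper expected values below track `a` or `c` rather than being recomputed.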
21020    #[simd_test(enable = "avx512fp16,avx512vl")]
21021    unsafe fn test_mm_fnmadd_sh() {
21022        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21023        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21024        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21025        let r = _mm_fnmadd_sh(a, b, c);
21026        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21027        assert_eq_m128h(r, e);
21028    }
21029
21030    #[simd_test(enable = "avx512fp16,avx512vl")]
21031    unsafe fn test_mm_mask_fnmadd_sh() {
21032        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21033        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21034        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21035        let r = _mm_mask_fnmadd_sh(a, 0, b, c);
21036        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21037        assert_eq_m128h(r, e);
21038        let r = _mm_mask_fnmadd_sh(a, 1, b, c);
21039        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21040        assert_eq_m128h(r, e);
21041    }
21042
21043    #[simd_test(enable = "avx512fp16,avx512vl")]
21044    unsafe fn test_mm_mask3_fnmadd_sh() {
21045        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21046        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21047        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21048        let r = _mm_mask3_fnmadd_sh(a, b, c, 0);
21049        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21050        assert_eq_m128h(r, e);
21051        let r = _mm_mask3_fnmadd_sh(a, b, c, 1);
21052        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21053        assert_eq_m128h(r, e);
21054    }
21055
21056    #[simd_test(enable = "avx512fp16,avx512vl")]
21057    unsafe fn test_mm_maskz_fnmadd_sh() {
21058        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21059        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21060        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21061        let r = _mm_maskz_fnmadd_sh(0, a, b, c);
21062        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21063        assert_eq_m128h(r, e);
21064        let r = _mm_maskz_fnmadd_sh(1, a, b, c);
21065        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21066        assert_eq_m128h(r, e);
21067    }
21068
21069    #[simd_test(enable = "avx512fp16,avx512vl")]
21070    unsafe fn test_mm_fnmadd_round_sh() {
21071        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21072        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21073        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21074        let r = _mm_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21075        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21076        assert_eq_m128h(r, e);
21077    }
21078
21079    #[simd_test(enable = "avx512fp16,avx512vl")]
21080    unsafe fn test_mm_mask_fnmadd_round_sh() {
21081        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21082        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21083        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21084        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21085            a, 0, b, c,
21086        );
21087        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21088        assert_eq_m128h(r, e);
21089        let r = _mm_mask_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21090            a, 1, b, c,
21091        );
21092        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21093        assert_eq_m128h(r, e);
21094    }
21095
21096    #[simd_test(enable = "avx512fp16,avx512vl")]
21097    unsafe fn test_mm_mask3_fnmadd_round_sh() {
21098        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21099        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21100        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21101        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21102            a, b, c, 0,
21103        );
21104        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21105        assert_eq_m128h(r, e);
21106        let r = _mm_mask3_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21107            a, b, c, 1,
21108        );
21109        let e = _mm_setr_ph(1.0, 30., 31., 32., 33., 34., 35., 36.);
21110        assert_eq_m128h(r, e);
21111    }
21112
21113    #[simd_test(enable = "avx512fp16,avx512vl")]
21114    unsafe fn test_mm_maskz_fnmadd_round_sh() {
21115        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21116        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21117        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21118        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21119            0, a, b, c,
21120        );
21121        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21122        assert_eq_m128h(r, e);
21123        let r = _mm_maskz_fnmadd_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21124            1, a, b, c,
21125        );
21126        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21127        assert_eq_m128h(r, e);
21128    }
21129
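    // fnmsub computes `-(a * b) - c` per lane: with a = 1.0, b = 2.0, c = 3.0 each
    // computed lane is -5.0. The mask/mask3/maskz blending rules are the same as for
    // the fnmadd tests above.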
21130    #[simd_test(enable = "avx512fp16,avx512vl")]
21131    unsafe fn test_mm_fnmsub_ph() {
21132        let a = _mm_set1_ph(1.0);
21133        let b = _mm_set1_ph(2.0);
21134        let c = _mm_set1_ph(3.0);
21135        let r = _mm_fnmsub_ph(a, b, c);
21136        let e = _mm_set1_ph(-5.0);
21137        assert_eq_m128h(r, e);
21138    }
21139
21140    #[simd_test(enable = "avx512fp16,avx512vl")]
21141    unsafe fn test_mm_mask_fnmsub_ph() {
21142        let a = _mm_set1_ph(1.0);
21143        let b = _mm_set1_ph(2.0);
21144        let c = _mm_set1_ph(3.0);
21145        let r = _mm_mask_fnmsub_ph(a, 0b01010101, b, c);
21146        let e = _mm_set_ph(1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0);
21147        assert_eq_m128h(r, e);
21148    }
21149
21150    #[simd_test(enable = "avx512fp16,avx512vl")]
21151    unsafe fn test_mm_mask3_fnmsub_ph() {
21152        let a = _mm_set1_ph(1.0);
21153        let b = _mm_set1_ph(2.0);
21154        let c = _mm_set1_ph(3.0);
21155        let r = _mm_mask3_fnmsub_ph(a, b, c, 0b01010101);
21156        let e = _mm_set_ph(3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0);
21157        assert_eq_m128h(r, e);
21158    }
21159
21160    #[simd_test(enable = "avx512fp16,avx512vl")]
21161    unsafe fn test_mm_maskz_fnmsub_ph() {
21162        let a = _mm_set1_ph(1.0);
21163        let b = _mm_set1_ph(2.0);
21164        let c = _mm_set1_ph(3.0);
21165        let r = _mm_maskz_fnmsub_ph(0b01010101, a, b, c);
21166        let e = _mm_set_ph(0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0);
21167        assert_eq_m128h(r, e);
21168    }
21169
21170    #[simd_test(enable = "avx512fp16,avx512vl")]
21171    unsafe fn test_mm256_fnmsub_ph() {
21172        let a = _mm256_set1_ph(1.0);
21173        let b = _mm256_set1_ph(2.0);
21174        let c = _mm256_set1_ph(3.0);
21175        let r = _mm256_fnmsub_ph(a, b, c);
21176        let e = _mm256_set1_ph(-5.0);
21177        assert_eq_m256h(r, e);
21178    }
21179
21180    #[simd_test(enable = "avx512fp16,avx512vl")]
21181    unsafe fn test_mm256_mask_fnmsub_ph() {
21182        let a = _mm256_set1_ph(1.0);
21183        let b = _mm256_set1_ph(2.0);
21184        let c = _mm256_set1_ph(3.0);
21185        let r = _mm256_mask_fnmsub_ph(a, 0b0101010101010101, b, c);
21186        let e = _mm256_set_ph(
21187            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21188        );
21189        assert_eq_m256h(r, e);
21190    }
21191
21192    #[simd_test(enable = "avx512fp16,avx512vl")]
21193    unsafe fn test_mm256_mask3_fnmsub_ph() {
21194        let a = _mm256_set1_ph(1.0);
21195        let b = _mm256_set1_ph(2.0);
21196        let c = _mm256_set1_ph(3.0);
21197        let r = _mm256_mask3_fnmsub_ph(a, b, c, 0b0101010101010101);
21198        let e = _mm256_set_ph(
21199            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21200        );
21201        assert_eq_m256h(r, e);
21202    }
21203
21204    #[simd_test(enable = "avx512fp16,avx512vl")]
21205    unsafe fn test_mm256_maskz_fnmsub_ph() {
21206        let a = _mm256_set1_ph(1.0);
21207        let b = _mm256_set1_ph(2.0);
21208        let c = _mm256_set1_ph(3.0);
21209        let r = _mm256_maskz_fnmsub_ph(0b0101010101010101, a, b, c);
21210        let e = _mm256_set_ph(
21211            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21212        );
21213        assert_eq_m256h(r, e);
21214    }
21215
21216    #[simd_test(enable = "avx512fp16")]
21217    unsafe fn test_mm512_fnmsub_ph() {
21218        let a = _mm512_set1_ph(1.0);
21219        let b = _mm512_set1_ph(2.0);
21220        let c = _mm512_set1_ph(3.0);
21221        let r = _mm512_fnmsub_ph(a, b, c);
21222        let e = _mm512_set1_ph(-5.0);
21223        assert_eq_m512h(r, e);
21224    }
21225
21226    #[simd_test(enable = "avx512fp16")]
21227    unsafe fn test_mm512_mask_fnmsub_ph() {
21228        let a = _mm512_set1_ph(1.0);
21229        let b = _mm512_set1_ph(2.0);
21230        let c = _mm512_set1_ph(3.0);
21231        let r = _mm512_mask_fnmsub_ph(a, 0b01010101010101010101010101010101, b, c);
21232        let e = _mm512_set_ph(
21233            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21234            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21235        );
21236        assert_eq_m512h(r, e);
21237    }
21238
21239    #[simd_test(enable = "avx512fp16")]
21240    unsafe fn test_mm512_mask3_fnmsub_ph() {
21241        let a = _mm512_set1_ph(1.0);
21242        let b = _mm512_set1_ph(2.0);
21243        let c = _mm512_set1_ph(3.0);
21244        let r = _mm512_mask3_fnmsub_ph(a, b, c, 0b01010101010101010101010101010101);
21245        let e = _mm512_set_ph(
21246            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21247            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21248        );
21249        assert_eq_m512h(r, e);
21250    }
21251
21252    #[simd_test(enable = "avx512fp16")]
21253    unsafe fn test_mm512_maskz_fnmsub_ph() {
21254        let a = _mm512_set1_ph(1.0);
21255        let b = _mm512_set1_ph(2.0);
21256        let c = _mm512_set1_ph(3.0);
21257        let r = _mm512_maskz_fnmsub_ph(0b01010101010101010101010101010101, a, b, c);
21258        let e = _mm512_set_ph(
21259            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21260            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21261        );
21262        assert_eq_m512h(r, e);
21263    }
21264
21265    #[simd_test(enable = "avx512fp16")]
21266    unsafe fn test_mm512_fnmsub_round_ph() {
21267        let a = _mm512_set1_ph(1.0);
21268        let b = _mm512_set1_ph(2.0);
21269        let c = _mm512_set1_ph(3.0);
21270        let r =
21271            _mm512_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21272        let e = _mm512_set1_ph(-5.0);
21273        assert_eq_m512h(r, e);
21274    }
21275
21276    #[simd_test(enable = "avx512fp16")]
21277    unsafe fn test_mm512_mask_fnmsub_round_ph() {
21278        let a = _mm512_set1_ph(1.0);
21279        let b = _mm512_set1_ph(2.0);
21280        let c = _mm512_set1_ph(3.0);
21281        let r = _mm512_mask_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21282            a,
21283            0b01010101010101010101010101010101,
21284            b,
21285            c,
21286        );
21287        let e = _mm512_set_ph(
21288            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21289            1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0, 1.0, -5.0,
21290        );
21291        assert_eq_m512h(r, e);
21292    }
21293
21294    #[simd_test(enable = "avx512fp16")]
21295    unsafe fn test_mm512_mask3_fnmsub_round_ph() {
21296        let a = _mm512_set1_ph(1.0);
21297        let b = _mm512_set1_ph(2.0);
21298        let c = _mm512_set1_ph(3.0);
21299        let r = _mm512_mask3_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21300            a,
21301            b,
21302            c,
21303            0b01010101010101010101010101010101,
21304        );
21305        let e = _mm512_set_ph(
21306            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21307            3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0, 3.0, -5.0,
21308        );
21309        assert_eq_m512h(r, e);
21310    }
21311
21312    #[simd_test(enable = "avx512fp16")]
21313    unsafe fn test_mm512_maskz_fnmsub_round_ph() {
21314        let a = _mm512_set1_ph(1.0);
21315        let b = _mm512_set1_ph(2.0);
21316        let c = _mm512_set1_ph(3.0);
21317        let r = _mm512_maskz_fnmsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21318            0b01010101010101010101010101010101,
21319            a,
21320            b,
21321            c,
21322        );
21323        let e = _mm512_set_ph(
21324            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21325            0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0, 0.0, -5.0,
21326        );
21327        assert_eq_m512h(r, e);
21328    }
21329
21330    #[simd_test(enable = "avx512fp16,avx512vl")]
21331    unsafe fn test_mm_fnmsub_sh() {
21332        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21333        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21334        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21335        let r = _mm_fnmsub_sh(a, b, c);
21336        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21337        assert_eq_m128h(r, e);
21338    }
21339
21340    #[simd_test(enable = "avx512fp16,avx512vl")]
21341    unsafe fn test_mm_mask_fnmsub_sh() {
21342        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21343        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21344        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21345        let r = _mm_mask_fnmsub_sh(a, 0, b, c);
21346        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21347        assert_eq_m128h(r, e);
21348        let r = _mm_mask_fnmsub_sh(a, 1, b, c);
21349        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21350        assert_eq_m128h(r, e);
21351    }
21352
21353    #[simd_test(enable = "avx512fp16,avx512vl")]
21354    unsafe fn test_mm_mask3_fnmsub_sh() {
21355        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21356        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21357        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21358        let r = _mm_mask3_fnmsub_sh(a, b, c, 0);
21359        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21360        assert_eq_m128h(r, e);
21361        let r = _mm_mask3_fnmsub_sh(a, b, c, 1);
21362        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21363        assert_eq_m128h(r, e);
21364    }
21365
21366    #[simd_test(enable = "avx512fp16,avx512vl")]
21367    unsafe fn test_mm_maskz_fnmsub_sh() {
21368        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21369        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21370        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21371        let r = _mm_maskz_fnmsub_sh(0, a, b, c);
21372        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21373        assert_eq_m128h(r, e);
21374        let r = _mm_maskz_fnmsub_sh(1, a, b, c);
21375        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21376        assert_eq_m128h(r, e);
21377    }
21378
21379    #[simd_test(enable = "avx512fp16,avx512vl")]
21380    unsafe fn test_mm_fnmsub_round_sh() {
21381        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21382        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21383        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21384        let r = _mm_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21385        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21386        assert_eq_m128h(r, e);
21387    }
21388
21389    #[simd_test(enable = "avx512fp16,avx512vl")]
21390    unsafe fn test_mm_mask_fnmsub_round_sh() {
21391        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21392        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21393        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21394        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21395            a, 0, b, c,
21396        );
21397        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21398        assert_eq_m128h(r, e);
21399        let r = _mm_mask_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21400            a, 1, b, c,
21401        );
21402        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21403        assert_eq_m128h(r, e);
21404    }
21405
21406    #[simd_test(enable = "avx512fp16,avx512vl")]
21407    unsafe fn test_mm_mask3_fnmsub_round_sh() {
21408        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21409        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21410        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21411        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21412            a, b, c, 0,
21413        );
21414        let e = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21415        assert_eq_m128h(r, e);
21416        let r = _mm_mask3_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21417            a, b, c, 1,
21418        );
21419        let e = _mm_setr_ph(-5.0, 30., 31., 32., 33., 34., 35., 36.);
21420        assert_eq_m128h(r, e);
21421    }
21422
21423    #[simd_test(enable = "avx512fp16,avx512vl")]
21424    unsafe fn test_mm_maskz_fnmsub_round_sh() {
21425        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
21426        let b = _mm_setr_ph(2.0, 20., 21., 22., 23., 24., 25., 26.);
21427        let c = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
21428        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21429            0, a, b, c,
21430        );
21431        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
21432        assert_eq_m128h(r, e);
21433        let r = _mm_maskz_fnmsub_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21434            1, a, b, c,
21435        );
21436        let e = _mm_setr_ph(-5.0, 10., 11., 12., 13., 14., 15., 16.);
21437        assert_eq_m128h(r, e);
21438    }
21439
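    // fmaddsub alternates per lane: even lanes compute `a * b - c` and odd lanes
    // compute `a * b + c`. With a = 1.0, b = 2.0, c = 3.0 that is -1.0 on even lanes
    // and 5.0 on odd lanes; since `_mm_set_ph` lists the highest lane first, lane 0
    // (-1.0) appears as the last literal in the expected vectors.
    //
    // A per-lane reference, as a sketch only:
    //
    //     fn fmaddsub_ref(i: usize, a: f16, b: f16, c: f16) -> f16 {
    //         if i % 2 == 0 { a * b - c } else { a * b + c }
    //     }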
21440    #[simd_test(enable = "avx512fp16,avx512vl")]
21441    unsafe fn test_mm_fmaddsub_ph() {
21442        let a = _mm_set1_ph(1.0);
21443        let b = _mm_set1_ph(2.0);
21444        let c = _mm_set1_ph(3.0);
21445        let r = _mm_fmaddsub_ph(a, b, c);
21446        let e = _mm_set_ph(5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0);
21447        assert_eq_m128h(r, e);
21448    }
21449
21450    #[simd_test(enable = "avx512fp16,avx512vl")]
21451    unsafe fn test_mm_mask_fmaddsub_ph() {
21452        let a = _mm_set1_ph(1.0);
21453        let b = _mm_set1_ph(2.0);
21454        let c = _mm_set1_ph(3.0);
21455        let r = _mm_mask_fmaddsub_ph(a, 0b00110011, b, c);
21456        let e = _mm_set_ph(1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0);
21457        assert_eq_m128h(r, e);
21458    }
21459
21460    #[simd_test(enable = "avx512fp16,avx512vl")]
21461    unsafe fn test_mm_mask3_fmaddsub_ph() {
21462        let a = _mm_set1_ph(1.0);
21463        let b = _mm_set1_ph(2.0);
21464        let c = _mm_set1_ph(3.0);
21465        let r = _mm_mask3_fmaddsub_ph(a, b, c, 0b00110011);
21466        let e = _mm_set_ph(3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0);
21467        assert_eq_m128h(r, e);
21468    }
21469
21470    #[simd_test(enable = "avx512fp16,avx512vl")]
21471    unsafe fn test_mm_maskz_fmaddsub_ph() {
21472        let a = _mm_set1_ph(1.0);
21473        let b = _mm_set1_ph(2.0);
21474        let c = _mm_set1_ph(3.0);
21475        let r = _mm_maskz_fmaddsub_ph(0b00110011, a, b, c);
21476        let e = _mm_set_ph(0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0);
21477        assert_eq_m128h(r, e);
21478    }
21479
21480    #[simd_test(enable = "avx512fp16,avx512vl")]
21481    unsafe fn test_mm256_fmaddsub_ph() {
21482        let a = _mm256_set1_ph(1.0);
21483        let b = _mm256_set1_ph(2.0);
21484        let c = _mm256_set1_ph(3.0);
21485        let r = _mm256_fmaddsub_ph(a, b, c);
21486        let e = _mm256_set_ph(
21487            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21488        );
21489        assert_eq_m256h(r, e);
21490    }
21491
21492    #[simd_test(enable = "avx512fp16,avx512vl")]
21493    unsafe fn test_mm256_mask_fmaddsub_ph() {
21494        let a = _mm256_set1_ph(1.0);
21495        let b = _mm256_set1_ph(2.0);
21496        let c = _mm256_set1_ph(3.0);
21497        let r = _mm256_mask_fmaddsub_ph(a, 0b0011001100110011, b, c);
21498        let e = _mm256_set_ph(
21499            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21500        );
21501        assert_eq_m256h(r, e);
21502    }
21503
21504    #[simd_test(enable = "avx512fp16,avx512vl")]
21505    unsafe fn test_mm256_mask3_fmaddsub_ph() {
21506        let a = _mm256_set1_ph(1.0);
21507        let b = _mm256_set1_ph(2.0);
21508        let c = _mm256_set1_ph(3.0);
21509        let r = _mm256_mask3_fmaddsub_ph(a, b, c, 0b0011001100110011);
21510        let e = _mm256_set_ph(
21511            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21512        );
21513        assert_eq_m256h(r, e);
21514    }
21515
21516    #[simd_test(enable = "avx512fp16,avx512vl")]
21517    unsafe fn test_mm256_maskz_fmaddsub_ph() {
21518        let a = _mm256_set1_ph(1.0);
21519        let b = _mm256_set1_ph(2.0);
21520        let c = _mm256_set1_ph(3.0);
21521        let r = _mm256_maskz_fmaddsub_ph(0b0011001100110011, a, b, c);
21522        let e = _mm256_set_ph(
21523            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21524        );
21525        assert_eq_m256h(r, e);
21526    }
21527
21528    #[simd_test(enable = "avx512fp16")]
21529    unsafe fn test_mm512_fmaddsub_ph() {
21530        let a = _mm512_set1_ph(1.0);
21531        let b = _mm512_set1_ph(2.0);
21532        let c = _mm512_set1_ph(3.0);
21533        let r = _mm512_fmaddsub_ph(a, b, c);
21534        let e = _mm512_set_ph(
21535            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21536            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21537        );
21538        assert_eq_m512h(r, e);
21539    }
21540
21541    #[simd_test(enable = "avx512fp16")]
21542    unsafe fn test_mm512_mask_fmaddsub_ph() {
21543        let a = _mm512_set1_ph(1.0);
21544        let b = _mm512_set1_ph(2.0);
21545        let c = _mm512_set1_ph(3.0);
21546        let r = _mm512_mask_fmaddsub_ph(a, 0b00110011001100110011001100110011, b, c);
21547        let e = _mm512_set_ph(
21548            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21549            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21550        );
21551        assert_eq_m512h(r, e);
21552    }
21553
21554    #[simd_test(enable = "avx512fp16")]
21555    unsafe fn test_mm512_mask3_fmaddsub_ph() {
21556        let a = _mm512_set1_ph(1.0);
21557        let b = _mm512_set1_ph(2.0);
21558        let c = _mm512_set1_ph(3.0);
21559        let r = _mm512_mask3_fmaddsub_ph(a, b, c, 0b00110011001100110011001100110011);
21560        let e = _mm512_set_ph(
21561            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21562            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21563        );
21564        assert_eq_m512h(r, e);
21565    }
21566
21567    #[simd_test(enable = "avx512fp16")]
21568    unsafe fn test_mm512_maskz_fmaddsub_ph() {
21569        let a = _mm512_set1_ph(1.0);
21570        let b = _mm512_set1_ph(2.0);
21571        let c = _mm512_set1_ph(3.0);
21572        let r = _mm512_maskz_fmaddsub_ph(0b00110011001100110011001100110011, a, b, c);
21573        let e = _mm512_set_ph(
21574            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21575            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21576        );
21577        assert_eq_m512h(r, e);
21578    }
21579
21580    #[simd_test(enable = "avx512fp16")]
21581    unsafe fn test_mm512_fmaddsub_round_ph() {
21582        let a = _mm512_set1_ph(1.0);
21583        let b = _mm512_set1_ph(2.0);
21584        let c = _mm512_set1_ph(3.0);
21585        let r =
21586            _mm512_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21587        let e = _mm512_set_ph(
21588            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21589            5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0,
21590        );
21591        assert_eq_m512h(r, e);
21592    }
21593
21594    #[simd_test(enable = "avx512fp16")]
21595    unsafe fn test_mm512_mask_fmaddsub_round_ph() {
21596        let a = _mm512_set1_ph(1.0);
21597        let b = _mm512_set1_ph(2.0);
21598        let c = _mm512_set1_ph(3.0);
21599        let r = _mm512_mask_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21600            a,
21601            0b00110011001100110011001100110011,
21602            b,
21603            c,
21604        );
21605        let e = _mm512_set_ph(
21606            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21607            1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0, 1.0, 1.0, 5.0, -1.0,
21608        );
21609        assert_eq_m512h(r, e);
21610    }
21611
21612    #[simd_test(enable = "avx512fp16")]
21613    unsafe fn test_mm512_mask3_fmaddsub_round_ph() {
21614        let a = _mm512_set1_ph(1.0);
21615        let b = _mm512_set1_ph(2.0);
21616        let c = _mm512_set1_ph(3.0);
21617        let r = _mm512_mask3_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21618            a,
21619            b,
21620            c,
21621            0b00110011001100110011001100110011,
21622        );
21623        let e = _mm512_set_ph(
21624            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21625            3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0, 3.0, 3.0, 5.0, -1.0,
21626        );
21627        assert_eq_m512h(r, e);
21628    }
21629
21630    #[simd_test(enable = "avx512fp16")]
21631    unsafe fn test_mm512_maskz_fmaddsub_round_ph() {
21632        let a = _mm512_set1_ph(1.0);
21633        let b = _mm512_set1_ph(2.0);
21634        let c = _mm512_set1_ph(3.0);
21635        let r = _mm512_maskz_fmaddsub_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21636            0b00110011001100110011001100110011,
21637            a,
21638            b,
21639            c,
21640        );
21641        let e = _mm512_set_ph(
21642            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21643            0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0, 0.0, 0.0, 5.0, -1.0,
21644        );
21645        assert_eq_m512h(r, e);
21646    }
21647
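    // fmsubadd is the mirror image of fmaddsub: even lanes compute `a * b + c` and odd
    // lanes compute `a * b - c`, giving 5.0 on even lanes and -1.0 on odd lanes here.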
21648    #[simd_test(enable = "avx512fp16,avx512vl")]
21649    unsafe fn test_mm_fmsubadd_ph() {
21650        let a = _mm_set1_ph(1.0);
21651        let b = _mm_set1_ph(2.0);
21652        let c = _mm_set1_ph(3.0);
21653        let r = _mm_fmsubadd_ph(a, b, c);
21654        let e = _mm_set_ph(-1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0);
21655        assert_eq_m128h(r, e);
21656    }
21657
21658    #[simd_test(enable = "avx512fp16,avx512vl")]
21659    unsafe fn test_mm_mask_fmsubadd_ph() {
21660        let a = _mm_set1_ph(1.0);
21661        let b = _mm_set1_ph(2.0);
21662        let c = _mm_set1_ph(3.0);
21663        let r = _mm_mask_fmsubadd_ph(a, 0b00110011, b, c);
21664        let e = _mm_set_ph(1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0);
21665        assert_eq_m128h(r, e);
21666    }
21667
21668    #[simd_test(enable = "avx512fp16,avx512vl")]
21669    unsafe fn test_mm_mask3_fmsubadd_ph() {
21670        let a = _mm_set1_ph(1.0);
21671        let b = _mm_set1_ph(2.0);
21672        let c = _mm_set1_ph(3.0);
21673        let r = _mm_mask3_fmsubadd_ph(a, b, c, 0b00110011);
21674        let e = _mm_set_ph(3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0);
21675        assert_eq_m128h(r, e);
21676    }
21677
21678    #[simd_test(enable = "avx512fp16,avx512vl")]
21679    unsafe fn test_mm_maskz_fmsubadd_ph() {
21680        let a = _mm_set1_ph(1.0);
21681        let b = _mm_set1_ph(2.0);
21682        let c = _mm_set1_ph(3.0);
21683        let r = _mm_maskz_fmsubadd_ph(0b00110011, a, b, c);
21684        let e = _mm_set_ph(0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0);
21685        assert_eq_m128h(r, e);
21686    }
21687
21688    #[simd_test(enable = "avx512fp16,avx512vl")]
21689    unsafe fn test_mm256_fmsubadd_ph() {
21690        let a = _mm256_set1_ph(1.0);
21691        let b = _mm256_set1_ph(2.0);
21692        let c = _mm256_set1_ph(3.0);
21693        let r = _mm256_fmsubadd_ph(a, b, c);
21694        let e = _mm256_set_ph(
21695            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21696        );
21697        assert_eq_m256h(r, e);
21698    }
21699
21700    #[simd_test(enable = "avx512fp16,avx512vl")]
21701    unsafe fn test_mm256_mask_fmsubadd_ph() {
21702        let a = _mm256_set1_ph(1.0);
21703        let b = _mm256_set1_ph(2.0);
21704        let c = _mm256_set1_ph(3.0);
21705        let r = _mm256_mask_fmsubadd_ph(a, 0b0011001100110011, b, c);
21706        let e = _mm256_set_ph(
21707            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21708        );
21709        assert_eq_m256h(r, e);
21710    }
21711
21712    #[simd_test(enable = "avx512fp16,avx512vl")]
21713    unsafe fn test_mm256_mask3_fmsubadd_ph() {
21714        let a = _mm256_set1_ph(1.0);
21715        let b = _mm256_set1_ph(2.0);
21716        let c = _mm256_set1_ph(3.0);
21717        let r = _mm256_mask3_fmsubadd_ph(a, b, c, 0b0011001100110011);
21718        let e = _mm256_set_ph(
21719            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21720        );
21721        assert_eq_m256h(r, e);
21722    }
21723
21724    #[simd_test(enable = "avx512fp16,avx512vl")]
21725    unsafe fn test_mm256_maskz_fmsubadd_ph() {
21726        let a = _mm256_set1_ph(1.0);
21727        let b = _mm256_set1_ph(2.0);
21728        let c = _mm256_set1_ph(3.0);
21729        let r = _mm256_maskz_fmsubadd_ph(0b0011001100110011, a, b, c);
21730        let e = _mm256_set_ph(
21731            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21732        );
21733        assert_eq_m256h(r, e);
21734    }
21735
21736    #[simd_test(enable = "avx512fp16")]
21737    unsafe fn test_mm512_fmsubadd_ph() {
21738        let a = _mm512_set1_ph(1.0);
21739        let b = _mm512_set1_ph(2.0);
21740        let c = _mm512_set1_ph(3.0);
21741        let r = _mm512_fmsubadd_ph(a, b, c);
21742        let e = _mm512_set_ph(
21743            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21744            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21745        );
21746        assert_eq_m512h(r, e);
21747    }
21748
21749    #[simd_test(enable = "avx512fp16")]
21750    unsafe fn test_mm512_mask_fmsubadd_ph() {
21751        let a = _mm512_set1_ph(1.0);
21752        let b = _mm512_set1_ph(2.0);
21753        let c = _mm512_set1_ph(3.0);
21754        let r = _mm512_mask_fmsubadd_ph(a, 0b00110011001100110011001100110011, b, c);
21755        let e = _mm512_set_ph(
21756            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21757            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21758        );
21759        assert_eq_m512h(r, e);
21760    }
21761
21762    #[simd_test(enable = "avx512fp16")]
21763    unsafe fn test_mm512_mask3_fmsubadd_ph() {
21764        let a = _mm512_set1_ph(1.0);
21765        let b = _mm512_set1_ph(2.0);
21766        let c = _mm512_set1_ph(3.0);
21767        let r = _mm512_mask3_fmsubadd_ph(a, b, c, 0b00110011001100110011001100110011);
21768        let e = _mm512_set_ph(
21769            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21770            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21771        );
21772        assert_eq_m512h(r, e);
21773    }
21774
21775    #[simd_test(enable = "avx512fp16")]
21776    unsafe fn test_mm512_maskz_fmsubadd_ph() {
21777        let a = _mm512_set1_ph(1.0);
21778        let b = _mm512_set1_ph(2.0);
21779        let c = _mm512_set1_ph(3.0);
21780        let r = _mm512_maskz_fmsubadd_ph(0b00110011001100110011001100110011, a, b, c);
21781        let e = _mm512_set_ph(
21782            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21783            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21784        );
21785        assert_eq_m512h(r, e);
21786    }
21787
21788    #[simd_test(enable = "avx512fp16")]
21789    unsafe fn test_mm512_fmsubadd_round_ph() {
21790        let a = _mm512_set1_ph(1.0);
21791        let b = _mm512_set1_ph(2.0);
21792        let c = _mm512_set1_ph(3.0);
21793        let r =
21794            _mm512_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b, c);
21795        let e = _mm512_set_ph(
21796            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21797            -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0, -1.0, 5.0,
21798        );
21799        assert_eq_m512h(r, e);
21800    }
21801
21802    #[simd_test(enable = "avx512fp16")]
21803    unsafe fn test_mm512_mask_fmsubadd_round_ph() {
21804        let a = _mm512_set1_ph(1.0);
21805        let b = _mm512_set1_ph(2.0);
21806        let c = _mm512_set1_ph(3.0);
21807        let r = _mm512_mask_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21808            a,
21809            0b00110011001100110011001100110011,
21810            b,
21811            c,
21812        );
21813        let e = _mm512_set_ph(
21814            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21815            1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0, 1.0, 1.0, -1.0, 5.0,
21816        );
21817        assert_eq_m512h(r, e);
21818    }
21819
21820    #[simd_test(enable = "avx512fp16")]
21821    unsafe fn test_mm512_mask3_fmsubadd_round_ph() {
21822        let a = _mm512_set1_ph(1.0);
21823        let b = _mm512_set1_ph(2.0);
21824        let c = _mm512_set1_ph(3.0);
21825        let r = _mm512_mask3_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21826            a,
21827            b,
21828            c,
21829            0b00110011001100110011001100110011,
21830        );
21831        let e = _mm512_set_ph(
21832            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21833            3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0, 3.0, 3.0, -1.0, 5.0,
21834        );
21835        assert_eq_m512h(r, e);
21836    }
21837
21838    #[simd_test(enable = "avx512fp16")]
21839    unsafe fn test_mm512_maskz_fmsubadd_round_ph() {
21840        let a = _mm512_set1_ph(1.0);
21841        let b = _mm512_set1_ph(2.0);
21842        let c = _mm512_set1_ph(3.0);
21843        let r = _mm512_maskz_fmsubadd_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
21844            0b00110011001100110011001100110011,
21845            a,
21846            b,
21847            c,
21848        );
21849        let e = _mm512_set_ph(
21850            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21851            0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0, 0.0, 0.0, -1.0, 5.0,
21852        );
21853        assert_eq_m512h(r, e);
21854    }
21855
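    // `rcp` is an approximate reciprocal rather than an exact division. The tests use
    // an input (2.0) whose reciprocal, 0.5, is a power of two, which the approximation
    // is expected to return exactly; that is what makes the exact `assert_eq_m128h`
    // comparisons workable here. Mask handling mirrors the arithmetic tests above,
    // with `src` supplying the kept lanes for the `_mask_` variant.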
21856    #[simd_test(enable = "avx512fp16,avx512vl")]
21857    unsafe fn test_mm_rcp_ph() {
21858        let a = _mm_set1_ph(2.0);
21859        let r = _mm_rcp_ph(a);
21860        let e = _mm_set1_ph(0.5);
21861        assert_eq_m128h(r, e);
21862    }
21863
21864    #[simd_test(enable = "avx512fp16,avx512vl")]
21865    unsafe fn test_mm_mask_rcp_ph() {
21866        let a = _mm_set1_ph(2.0);
21867        let src = _mm_set1_ph(1.0);
21868        let r = _mm_mask_rcp_ph(src, 0b01010101, a);
21869        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21870        assert_eq_m128h(r, e);
21871    }
21872
21873    #[simd_test(enable = "avx512fp16,avx512vl")]
21874    unsafe fn test_mm_maskz_rcp_ph() {
21875        let a = _mm_set1_ph(2.0);
21876        let r = _mm_maskz_rcp_ph(0b01010101, a);
21877        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21878        assert_eq_m128h(r, e);
21879    }
21880
21881    #[simd_test(enable = "avx512fp16,avx512vl")]
21882    unsafe fn test_mm256_rcp_ph() {
21883        let a = _mm256_set1_ph(2.0);
21884        let r = _mm256_rcp_ph(a);
21885        let e = _mm256_set1_ph(0.5);
21886        assert_eq_m256h(r, e);
21887    }
21888
21889    #[simd_test(enable = "avx512fp16,avx512vl")]
21890    unsafe fn test_mm256_mask_rcp_ph() {
21891        let a = _mm256_set1_ph(2.0);
21892        let src = _mm256_set1_ph(1.0);
21893        let r = _mm256_mask_rcp_ph(src, 0b0101010101010101, a);
21894        let e = _mm256_set_ph(
21895            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21896        );
21897        assert_eq_m256h(r, e);
21898    }
21899
21900    #[simd_test(enable = "avx512fp16,avx512vl")]
21901    unsafe fn test_mm256_maskz_rcp_ph() {
21902        let a = _mm256_set1_ph(2.0);
21903        let r = _mm256_maskz_rcp_ph(0b0101010101010101, a);
21904        let e = _mm256_set_ph(
21905            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21906        );
21907        assert_eq_m256h(r, e);
21908    }
21909
21910    #[simd_test(enable = "avx512fp16")]
21911    unsafe fn test_mm512_rcp_ph() {
21912        let a = _mm512_set1_ph(2.0);
21913        let r = _mm512_rcp_ph(a);
21914        let e = _mm512_set1_ph(0.5);
21915        assert_eq_m512h(r, e);
21916    }
21917
21918    #[simd_test(enable = "avx512fp16")]
21919    unsafe fn test_mm512_mask_rcp_ph() {
21920        let a = _mm512_set1_ph(2.0);
21921        let src = _mm512_set1_ph(1.0);
21922        let r = _mm512_mask_rcp_ph(src, 0b01010101010101010101010101010101, a);
21923        let e = _mm512_set_ph(
21924            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
21925            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
21926        );
21927        assert_eq_m512h(r, e);
21928    }
21929
21930    #[simd_test(enable = "avx512fp16")]
21931    unsafe fn test_mm512_maskz_rcp_ph() {
21932        let a = _mm512_set1_ph(2.0);
21933        let r = _mm512_maskz_rcp_ph(0b01010101010101010101010101010101, a);
21934        let e = _mm512_set_ph(
21935            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
21936            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
21937        );
21938        assert_eq_m512h(r, e);
21939    }
21940
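    // The scalar `rcp`/`rsqrt` forms take two vectors: the approximation is computed
    // from lane 0 of `b`, while lanes 1..=7 of the result are copied from `a`.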
21941    #[simd_test(enable = "avx512fp16,avx512vl")]
21942    unsafe fn test_mm_rcp_sh() {
21943        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21944        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21945        let r = _mm_rcp_sh(a, b);
21946        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21947        assert_eq_m128h(r, e);
21948    }
21949
21950    #[simd_test(enable = "avx512fp16,avx512vl")]
21951    unsafe fn test_mm_mask_rcp_sh() {
21952        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21953        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21954        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
21955        let r = _mm_mask_rcp_sh(src, 0, a, b);
21956        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21957        assert_eq_m128h(r, e);
21958        let r = _mm_mask_rcp_sh(src, 1, a, b);
21959        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21960        assert_eq_m128h(r, e);
21961    }
21962
21963    #[simd_test(enable = "avx512fp16,avx512vl")]
21964    unsafe fn test_mm_maskz_rcp_sh() {
21965        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21966        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
21967        let r = _mm_maskz_rcp_sh(0, a, b);
21968        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21969        assert_eq_m128h(r, e);
21970        let r = _mm_maskz_rcp_sh(1, a, b);
21971        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
21972        assert_eq_m128h(r, e);
21973    }
21974
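    // `rsqrt` approximates 1 / sqrt(x); with x = 4.0 the exact answer 0.5 is again a
    // power of two, so the same exact-comparison approach as the `rcp` tests applies.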
21975    #[simd_test(enable = "avx512fp16,avx512vl")]
21976    unsafe fn test_mm_rsqrt_ph() {
21977        let a = _mm_set1_ph(4.0);
21978        let r = _mm_rsqrt_ph(a);
21979        let e = _mm_set1_ph(0.5);
21980        assert_eq_m128h(r, e);
21981    }
21982
21983    #[simd_test(enable = "avx512fp16,avx512vl")]
21984    unsafe fn test_mm_mask_rsqrt_ph() {
21985        let a = _mm_set1_ph(4.0);
21986        let src = _mm_set1_ph(1.0);
21987        let r = _mm_mask_rsqrt_ph(src, 0b01010101, a);
21988        let e = _mm_set_ph(1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5);
21989        assert_eq_m128h(r, e);
21990    }
21991
21992    #[simd_test(enable = "avx512fp16,avx512vl")]
21993    unsafe fn test_mm_maskz_rsqrt_ph() {
21994        let a = _mm_set1_ph(4.0);
21995        let r = _mm_maskz_rsqrt_ph(0b01010101, a);
21996        let e = _mm_set_ph(0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5);
21997        assert_eq_m128h(r, e);
21998    }
21999
22000    #[simd_test(enable = "avx512fp16,avx512vl")]
22001    unsafe fn test_mm256_rsqrt_ph() {
22002        let a = _mm256_set1_ph(4.0);
22003        let r = _mm256_rsqrt_ph(a);
22004        let e = _mm256_set1_ph(0.5);
22005        assert_eq_m256h(r, e);
22006    }
22007
22008    #[simd_test(enable = "avx512fp16,avx512vl")]
22009    unsafe fn test_mm256_mask_rsqrt_ph() {
22010        let a = _mm256_set1_ph(4.0);
22011        let src = _mm256_set1_ph(1.0);
22012        let r = _mm256_mask_rsqrt_ph(src, 0b0101010101010101, a);
22013        let e = _mm256_set_ph(
22014            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
22015        );
22016        assert_eq_m256h(r, e);
22017    }
22018
22019    #[simd_test(enable = "avx512fp16,avx512vl")]
22020    unsafe fn test_mm256_maskz_rsqrt_ph() {
22021        let a = _mm256_set1_ph(4.0);
22022        let r = _mm256_maskz_rsqrt_ph(0b0101010101010101, a);
22023        let e = _mm256_set_ph(
22024            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
22025        );
22026        assert_eq_m256h(r, e);
22027    }
22028
22029    #[simd_test(enable = "avx512fp16")]
22030    unsafe fn test_mm512_rsqrt_ph() {
22031        let a = _mm512_set1_ph(4.0);
22032        let r = _mm512_rsqrt_ph(a);
22033        let e = _mm512_set1_ph(0.5);
22034        assert_eq_m512h(r, e);
22035    }
22036
22037    #[simd_test(enable = "avx512fp16")]
22038    unsafe fn test_mm512_mask_rsqrt_ph() {
22039        let a = _mm512_set1_ph(4.0);
22040        let src = _mm512_set1_ph(1.0);
22041        let r = _mm512_mask_rsqrt_ph(src, 0b01010101010101010101010101010101, a);
22042        let e = _mm512_set_ph(
22043            1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0,
22044            0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5, 1.0, 0.5,
22045        );
22046        assert_eq_m512h(r, e);
22047    }
22048
22049    #[simd_test(enable = "avx512fp16")]
22050    unsafe fn test_mm512_maskz_rsqrt_ph() {
22051        let a = _mm512_set1_ph(4.0);
22052        let r = _mm512_maskz_rsqrt_ph(0b01010101010101010101010101010101, a);
22053        let e = _mm512_set_ph(
22054            0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0,
22055            0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5,
22056        );
22057        assert_eq_m512h(r, e);
22058    }
22059
22060    #[simd_test(enable = "avx512fp16,avx512vl")]
22061    unsafe fn test_mm_rsqrt_sh() {
22062        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22063        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22064        let r = _mm_rsqrt_sh(a, b);
22065        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22066        assert_eq_m128h(r, e);
22067    }
22068
22069    #[simd_test(enable = "avx512fp16,avx512vl")]
22070    unsafe fn test_mm_mask_rsqrt_sh() {
22071        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22072        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22073        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22074        let r = _mm_mask_rsqrt_sh(src, 0, a, b);
22075        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22076        assert_eq_m128h(r, e);
22077        let r = _mm_mask_rsqrt_sh(src, 1, a, b);
22078        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22079        assert_eq_m128h(r, e);
22080    }
22081
22082    #[simd_test(enable = "avx512fp16,avx512vl")]
22083    unsafe fn test_mm_maskz_rsqrt_sh() {
22084        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22085        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22086        let r = _mm_maskz_rsqrt_sh(0, a, b);
22087        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22088        assert_eq_m128h(r, e);
22089        let r = _mm_maskz_rsqrt_sh(1, a, b);
22090        let e = _mm_setr_ph(0.5, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22091        assert_eq_m128h(r, e);
22092    }
22093
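    // Unlike the approximate rsqrt above, sqrt computes the correctly rounded square root, so
    // 4.0 -> 2.0 in every selected lane.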
22094    #[simd_test(enable = "avx512fp16,avx512vl")]
22095    unsafe fn test_mm_sqrt_ph() {
22096        let a = _mm_set1_ph(4.0);
22097        let r = _mm_sqrt_ph(a);
22098        let e = _mm_set1_ph(2.0);
22099        assert_eq_m128h(r, e);
22100    }
22101
22102    #[simd_test(enable = "avx512fp16,avx512vl")]
22103    unsafe fn test_mm_mask_sqrt_ph() {
22104        let a = _mm_set1_ph(4.0);
22105        let src = _mm_set1_ph(1.0);
22106        let r = _mm_mask_sqrt_ph(src, 0b01010101, a);
22107        let e = _mm_set_ph(1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0);
22108        assert_eq_m128h(r, e);
22109    }
22110
22111    #[simd_test(enable = "avx512fp16,avx512vl")]
22112    unsafe fn test_mm_maskz_sqrt_ph() {
22113        let a = _mm_set1_ph(4.0);
22114        let r = _mm_maskz_sqrt_ph(0b01010101, a);
22115        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22116        assert_eq_m128h(r, e);
22117    }
22118
22119    #[simd_test(enable = "avx512fp16,avx512vl")]
22120    unsafe fn test_mm256_sqrt_ph() {
22121        let a = _mm256_set1_ph(4.0);
22122        let r = _mm256_sqrt_ph(a);
22123        let e = _mm256_set1_ph(2.0);
22124        assert_eq_m256h(r, e);
22125    }
22126
22127    #[simd_test(enable = "avx512fp16,avx512vl")]
22128    unsafe fn test_mm256_mask_sqrt_ph() {
22129        let a = _mm256_set1_ph(4.0);
22130        let src = _mm256_set1_ph(1.0);
22131        let r = _mm256_mask_sqrt_ph(src, 0b0101010101010101, a);
22132        let e = _mm256_set_ph(
22133            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22134        );
22135        assert_eq_m256h(r, e);
22136    }
22137
22138    #[simd_test(enable = "avx512fp16,avx512vl")]
22139    unsafe fn test_mm256_maskz_sqrt_ph() {
22140        let a = _mm256_set1_ph(4.0);
22141        let r = _mm256_maskz_sqrt_ph(0b0101010101010101, a);
22142        let e = _mm256_set_ph(
22143            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22144        );
22145        assert_eq_m256h(r, e);
22146    }
22147
22148    #[simd_test(enable = "avx512fp16")]
22149    unsafe fn test_mm512_sqrt_ph() {
22150        let a = _mm512_set1_ph(4.0);
22151        let r = _mm512_sqrt_ph(a);
22152        let e = _mm512_set1_ph(2.0);
22153        assert_eq_m512h(r, e);
22154    }
22155
22156    #[simd_test(enable = "avx512fp16")]
22157    unsafe fn test_mm512_mask_sqrt_ph() {
22158        let a = _mm512_set1_ph(4.0);
22159        let src = _mm512_set1_ph(1.0);
22160        let r = _mm512_mask_sqrt_ph(src, 0b01010101010101010101010101010101, a);
22161        let e = _mm512_set_ph(
22162            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22163            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22164        );
22165        assert_eq_m512h(r, e);
22166    }
22167
22168    #[simd_test(enable = "avx512fp16")]
22169    unsafe fn test_mm512_maskz_sqrt_ph() {
22170        let a = _mm512_set1_ph(4.0);
22171        let r = _mm512_maskz_sqrt_ph(0b01010101010101010101010101010101, a);
22172        let e = _mm512_set_ph(
22173            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22174            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22175        );
22176        assert_eq_m512h(r, e);
22177    }
22178
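    // The `_round` variants take the rounding control as a const generic.
    // `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` selects round-to-nearest-even and
    // suppresses floating-point exceptions (SAE); with these exactly representable inputs the
    // results match the non-`_round` forms.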
22179    #[simd_test(enable = "avx512fp16")]
22180    unsafe fn test_mm512_sqrt_round_ph() {
22181        let a = _mm512_set1_ph(4.0);
22182        let r = _mm512_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
22183        let e = _mm512_set1_ph(2.0);
22184        assert_eq_m512h(r, e);
22185    }
22186
22187    #[simd_test(enable = "avx512fp16")]
22188    unsafe fn test_mm512_mask_sqrt_round_ph() {
22189        let a = _mm512_set1_ph(4.0);
22190        let src = _mm512_set1_ph(1.0);
22191        let r = _mm512_mask_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22192            src,
22193            0b01010101010101010101010101010101,
22194            a,
22195        );
22196        let e = _mm512_set_ph(
22197            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
22198            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
22199        );
22200        assert_eq_m512h(r, e);
22201    }
22202
22203    #[simd_test(enable = "avx512fp16")]
22204    unsafe fn test_mm512_maskz_sqrt_round_ph() {
22205        let a = _mm512_set1_ph(4.0);
22206        let r = _mm512_maskz_sqrt_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22207            0b01010101010101010101010101010101,
22208            a,
22209        );
22210        let e = _mm512_set_ph(
22211            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22212            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22213        );
22214        assert_eq_m512h(r, e);
22215    }
22216
22217    #[simd_test(enable = "avx512fp16,avx512vl")]
22218    unsafe fn test_mm_sqrt_sh() {
22219        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22220        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22221        let r = _mm_sqrt_sh(a, b);
22222        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22223        assert_eq_m128h(r, e);
22224    }
22225
22226    #[simd_test(enable = "avx512fp16,avx512vl")]
22227    unsafe fn test_mm_mask_sqrt_sh() {
22228        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22229        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22230        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22231        let r = _mm_mask_sqrt_sh(src, 0, a, b);
22232        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22233        assert_eq_m128h(r, e);
22234        let r = _mm_mask_sqrt_sh(src, 1, a, b);
22235        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22236        assert_eq_m128h(r, e);
22237    }
22238
22239    #[simd_test(enable = "avx512fp16,avx512vl")]
22240    unsafe fn test_mm_maskz_sqrt_sh() {
22241        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22242        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22243        let r = _mm_maskz_sqrt_sh(0, a, b);
22244        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22245        assert_eq_m128h(r, e);
22246        let r = _mm_maskz_sqrt_sh(1, a, b);
22247        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22248        assert_eq_m128h(r, e);
22249    }
22250
22251    #[simd_test(enable = "avx512fp16,avx512vl")]
22252    unsafe fn test_mm_sqrt_round_sh() {
22253        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22254        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22255        let r = _mm_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22256        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22257        assert_eq_m128h(r, e);
22258    }
22259
22260    #[simd_test(enable = "avx512fp16,avx512vl")]
22261    unsafe fn test_mm_mask_sqrt_round_sh() {
22262        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22263        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22264        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22265        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22266            src, 0, a, b,
22267        );
22268        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22269        assert_eq_m128h(r, e);
22270        let r = _mm_mask_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22271            src, 1, a, b,
22272        );
22273        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22274        assert_eq_m128h(r, e);
22275    }
22276
22277    #[simd_test(enable = "avx512fp16,avx512vl")]
22278    unsafe fn test_mm_maskz_sqrt_round_sh() {
22279        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22280        let b = _mm_setr_ph(4.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0);
22281        let r =
22282            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22283        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22284        assert_eq_m128h(r, e);
22285        let r =
22286            _mm_maskz_sqrt_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22287        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22288        assert_eq_m128h(r, e);
22289    }
22290
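    // Per Intel's documentation, the `max`/`min` intrinsics do not follow IEEE 754
    // maxNum/minNum semantics for NaN or signed-zero inputs (the second operand is returned in
    // those cases). The tests below use only finite, distinct values, so max(2.0, 1.0) is
    // simply 2.0.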
22291    #[simd_test(enable = "avx512fp16,avx512vl")]
22292    unsafe fn test_mm_max_ph() {
22293        let a = _mm_set1_ph(2.0);
22294        let b = _mm_set1_ph(1.0);
22295        let r = _mm_max_ph(a, b);
22296        let e = _mm_set1_ph(2.0);
22297        assert_eq_m128h(r, e);
22298    }
22299
22300    #[simd_test(enable = "avx512fp16,avx512vl")]
22301    unsafe fn test_mm_mask_max_ph() {
22302        let a = _mm_set1_ph(2.0);
22303        let b = _mm_set1_ph(1.0);
22304        let src = _mm_set1_ph(3.0);
22305        let r = _mm_mask_max_ph(src, 0b01010101, a, b);
22306        let e = _mm_set_ph(3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0);
22307        assert_eq_m128h(r, e);
22308    }
22309
22310    #[simd_test(enable = "avx512fp16,avx512vl")]
22311    unsafe fn test_mm_maskz_max_ph() {
22312        let a = _mm_set1_ph(2.0);
22313        let b = _mm_set1_ph(1.0);
22314        let r = _mm_maskz_max_ph(0b01010101, a, b);
22315        let e = _mm_set_ph(0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0);
22316        assert_eq_m128h(r, e);
22317    }
22318
22319    #[simd_test(enable = "avx512fp16,avx512vl")]
22320    unsafe fn test_mm256_max_ph() {
22321        let a = _mm256_set1_ph(2.0);
22322        let b = _mm256_set1_ph(1.0);
22323        let r = _mm256_max_ph(a, b);
22324        let e = _mm256_set1_ph(2.0);
22325        assert_eq_m256h(r, e);
22326    }
22327
22328    #[simd_test(enable = "avx512fp16,avx512vl")]
22329    unsafe fn test_mm256_mask_max_ph() {
22330        let a = _mm256_set1_ph(2.0);
22331        let b = _mm256_set1_ph(1.0);
22332        let src = _mm256_set1_ph(3.0);
22333        let r = _mm256_mask_max_ph(src, 0b0101010101010101, a, b);
22334        let e = _mm256_set_ph(
22335            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22336        );
22337        assert_eq_m256h(r, e);
22338    }
22339
22340    #[simd_test(enable = "avx512fp16,avx512vl")]
22341    unsafe fn test_mm256_maskz_max_ph() {
22342        let a = _mm256_set1_ph(2.0);
22343        let b = _mm256_set1_ph(1.0);
22344        let r = _mm256_maskz_max_ph(0b0101010101010101, a, b);
22345        let e = _mm256_set_ph(
22346            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22347        );
22348        assert_eq_m256h(r, e);
22349    }
22350
22351    #[simd_test(enable = "avx512fp16")]
22352    unsafe fn test_mm512_max_ph() {
22353        let a = _mm512_set1_ph(2.0);
22354        let b = _mm512_set1_ph(1.0);
22355        let r = _mm512_max_ph(a, b);
22356        let e = _mm512_set1_ph(2.0);
22357        assert_eq_m512h(r, e);
22358    }
22359
22360    #[simd_test(enable = "avx512fp16")]
22361    unsafe fn test_mm512_mask_max_ph() {
22362        let a = _mm512_set1_ph(2.0);
22363        let b = _mm512_set1_ph(1.0);
22364        let src = _mm512_set1_ph(3.0);
22365        let r = _mm512_mask_max_ph(src, 0b01010101010101010101010101010101, a, b);
22366        let e = _mm512_set_ph(
22367            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22368            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22369        );
22370        assert_eq_m512h(r, e);
22371    }
22372
22373    #[simd_test(enable = "avx512fp16")]
22374    unsafe fn test_mm512_maskz_max_ph() {
22375        let a = _mm512_set1_ph(2.0);
22376        let b = _mm512_set1_ph(1.0);
22377        let r = _mm512_maskz_max_ph(0b01010101010101010101010101010101, a, b);
22378        let e = _mm512_set_ph(
22379            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22380            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22381        );
22382        assert_eq_m512h(r, e);
22383    }
22384
22385    #[simd_test(enable = "avx512fp16")]
22386    unsafe fn test_mm512_max_round_ph() {
22387        let a = _mm512_set1_ph(2.0);
22388        let b = _mm512_set1_ph(1.0);
22389        let r = _mm512_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22390        let e = _mm512_set1_ph(2.0);
22391        assert_eq_m512h(r, e);
22392    }
22393
22394    #[simd_test(enable = "avx512fp16")]
22395    unsafe fn test_mm512_mask_max_round_ph() {
22396        let a = _mm512_set1_ph(2.0);
22397        let b = _mm512_set1_ph(1.0);
22398        let src = _mm512_set1_ph(3.0);
22399        let r = _mm512_mask_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22400            src,
22401            0b01010101010101010101010101010101,
22402            a,
22403            b,
22404        );
22405        let e = _mm512_set_ph(
22406            3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0,
22407            2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0,
22408        );
22409        assert_eq_m512h(r, e);
22410    }
22411
22412    #[simd_test(enable = "avx512fp16")]
22413    unsafe fn test_mm512_maskz_max_round_ph() {
22414        let a = _mm512_set1_ph(2.0);
22415        let b = _mm512_set1_ph(1.0);
22416        let r = _mm512_maskz_max_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22417            0b01010101010101010101010101010101,
22418            a,
22419            b,
22420        );
22421        let e = _mm512_set_ph(
22422            0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0,
22423            2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0, 0.0, 2.0,
22424        );
22425        assert_eq_m512h(r, e);
22426    }
22427
22428    #[simd_test(enable = "avx512fp16,avx512vl")]
22429    unsafe fn test_mm_max_sh() {
22430        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22431        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22432        let r = _mm_max_sh(a, b);
22433        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22434        assert_eq_m128h(r, e);
22435    }
22436
22437    #[simd_test(enable = "avx512fp16,avx512vl")]
22438    unsafe fn test_mm_mask_max_sh() {
22439        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22440        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22441        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22442        let r = _mm_mask_max_sh(src, 0, a, b);
22443        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22444        assert_eq_m128h(r, e);
22445        let r = _mm_mask_max_sh(src, 1, a, b);
22446        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22447        assert_eq_m128h(r, e);
22448    }
22449
22450    #[simd_test(enable = "avx512fp16,avx512vl")]
22451    unsafe fn test_mm_maskz_max_sh() {
22452        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22453        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22454        let r = _mm_maskz_max_sh(0, a, b);
22455        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22456        assert_eq_m128h(r, e);
22457        let r = _mm_maskz_max_sh(1, a, b);
22458        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22459        assert_eq_m128h(r, e);
22460    }
22461
22462    #[simd_test(enable = "avx512fp16,avx512vl")]
22463    unsafe fn test_mm_max_round_sh() {
22464        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22465        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22466        let r = _mm_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22467        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22468        assert_eq_m128h(r, e);
22469    }
22470
22471    #[simd_test(enable = "avx512fp16,avx512vl")]
22472    unsafe fn test_mm_mask_max_round_sh() {
22473        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22474        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22475        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22476        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22477            src, 0, a, b,
22478        );
22479        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22480        assert_eq_m128h(r, e);
22481        let r = _mm_mask_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22482            src, 1, a, b,
22483        );
22484        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22485        assert_eq_m128h(r, e);
22486    }
22487
22488    #[simd_test(enable = "avx512fp16,avx512vl")]
22489    unsafe fn test_mm_maskz_max_round_sh() {
22490        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22491        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22492        let r =
22493            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22494        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22495        assert_eq_m128h(r, e);
22496        let r =
22497            _mm_maskz_max_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22498        let e = _mm_setr_ph(2.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22499        assert_eq_m128h(r, e);
22500    }
22501
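    // The min tests mirror the max tests above: min(2.0, 1.0) == 1.0 in every selected lane.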
22502    #[simd_test(enable = "avx512fp16,avx512vl")]
22503    unsafe fn test_mm_min_ph() {
22504        let a = _mm_set1_ph(2.0);
22505        let b = _mm_set1_ph(1.0);
22506        let r = _mm_min_ph(a, b);
22507        let e = _mm_set1_ph(1.0);
22508        assert_eq_m128h(r, e);
22509    }
22510
22511    #[simd_test(enable = "avx512fp16,avx512vl")]
22512    unsafe fn test_mm_mask_min_ph() {
22513        let a = _mm_set1_ph(2.0);
22514        let b = _mm_set1_ph(1.0);
22515        let src = _mm_set1_ph(3.0);
22516        let r = _mm_mask_min_ph(src, 0b01010101, a, b);
22517        let e = _mm_set_ph(3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0);
22518        assert_eq_m128h(r, e);
22519    }
22520
22521    #[simd_test(enable = "avx512fp16,avx512vl")]
22522    unsafe fn test_mm_maskz_min_ph() {
22523        let a = _mm_set1_ph(2.0);
22524        let b = _mm_set1_ph(1.0);
22525        let r = _mm_maskz_min_ph(0b01010101, a, b);
22526        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22527        assert_eq_m128h(r, e);
22528    }
22529
22530    #[simd_test(enable = "avx512fp16,avx512vl")]
22531    unsafe fn test_mm256_min_ph() {
22532        let a = _mm256_set1_ph(2.0);
22533        let b = _mm256_set1_ph(1.0);
22534        let r = _mm256_min_ph(a, b);
22535        let e = _mm256_set1_ph(1.0);
22536        assert_eq_m256h(r, e);
22537    }
22538
22539    #[simd_test(enable = "avx512fp16,avx512vl")]
22540    unsafe fn test_mm256_mask_min_ph() {
22541        let a = _mm256_set1_ph(2.0);
22542        let b = _mm256_set1_ph(1.0);
22543        let src = _mm256_set1_ph(3.0);
22544        let r = _mm256_mask_min_ph(src, 0b0101010101010101, a, b);
22545        let e = _mm256_set_ph(
22546            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22547        );
22548        assert_eq_m256h(r, e);
22549    }
22550
22551    #[simd_test(enable = "avx512fp16,avx512vl")]
22552    unsafe fn test_mm256_maskz_min_ph() {
22553        let a = _mm256_set1_ph(2.0);
22554        let b = _mm256_set1_ph(1.0);
22555        let r = _mm256_maskz_min_ph(0b0101010101010101, a, b);
22556        let e = _mm256_set_ph(
22557            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22558        );
22559        assert_eq_m256h(r, e);
22560    }
22561
22562    #[simd_test(enable = "avx512fp16")]
22563    unsafe fn test_mm512_min_ph() {
22564        let a = _mm512_set1_ph(2.0);
22565        let b = _mm512_set1_ph(1.0);
22566        let r = _mm512_min_ph(a, b);
22567        let e = _mm512_set1_ph(1.0);
22568        assert_eq_m512h(r, e);
22569    }
22570
22571    #[simd_test(enable = "avx512fp16")]
22572    unsafe fn test_mm512_mask_min_ph() {
22573        let a = _mm512_set1_ph(2.0);
22574        let b = _mm512_set1_ph(1.0);
22575        let src = _mm512_set1_ph(3.0);
22576        let r = _mm512_mask_min_ph(src, 0b01010101010101010101010101010101, a, b);
22577        let e = _mm512_set_ph(
22578            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22579            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22580        );
22581        assert_eq_m512h(r, e);
22582    }
22583
22584    #[simd_test(enable = "avx512fp16")]
22585    unsafe fn test_mm512_maskz_min_ph() {
22586        let a = _mm512_set1_ph(2.0);
22587        let b = _mm512_set1_ph(1.0);
22588        let r = _mm512_maskz_min_ph(0b01010101010101010101010101010101, a, b);
22589        let e = _mm512_set_ph(
22590            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22591            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22592        );
22593        assert_eq_m512h(r, e);
22594    }
22595
22596    #[simd_test(enable = "avx512fp16")]
22597    unsafe fn test_mm512_min_round_ph() {
22598        let a = _mm512_set1_ph(2.0);
22599        let b = _mm512_set1_ph(1.0);
22600        let r = _mm512_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22601        let e = _mm512_set1_ph(1.0);
22602        assert_eq_m512h(r, e);
22603    }
22604
22605    #[simd_test(enable = "avx512fp16")]
22606    unsafe fn test_mm512_mask_min_round_ph() {
22607        let a = _mm512_set1_ph(2.0);
22608        let b = _mm512_set1_ph(1.0);
22609        let src = _mm512_set1_ph(3.0);
22610        let r = _mm512_mask_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22611            src,
22612            0b01010101010101010101010101010101,
22613            a,
22614            b,
22615        );
22616        let e = _mm512_set_ph(
22617            3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0,
22618            1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0, 3.0, 1.0,
22619        );
22620        assert_eq_m512h(r, e);
22621    }
22622
22623    #[simd_test(enable = "avx512fp16")]
22624    unsafe fn test_mm512_maskz_min_round_ph() {
22625        let a = _mm512_set1_ph(2.0);
22626        let b = _mm512_set1_ph(1.0);
22627        let r = _mm512_maskz_min_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22628            0b01010101010101010101010101010101,
22629            a,
22630            b,
22631        );
22632        let e = _mm512_set_ph(
22633            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22634            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22635        );
22636        assert_eq_m512h(r, e);
22637    }
22638
22639    #[simd_test(enable = "avx512fp16,avx512vl")]
22640    unsafe fn test_mm_min_sh() {
22641        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22642        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22643        let r = _mm_min_sh(a, b);
22644        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22645        assert_eq_m128h(r, e);
22646    }
22647
22648    #[simd_test(enable = "avx512fp16,avx512vl")]
22649    unsafe fn test_mm_mask_min_sh() {
22650        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22651        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22652        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22653        let r = _mm_mask_min_sh(src, 0, a, b);
22654        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22655        assert_eq_m128h(r, e);
22656        let r = _mm_mask_min_sh(src, 1, a, b);
22657        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22658        assert_eq_m128h(r, e);
22659    }
22660
22661    #[simd_test(enable = "avx512fp16,avx512vl")]
22662    unsafe fn test_mm_maskz_min_sh() {
22663        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22664        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22665        let r = _mm_maskz_min_sh(0, a, b);
22666        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22667        assert_eq_m128h(r, e);
22668        let r = _mm_maskz_min_sh(1, a, b);
22669        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22670        assert_eq_m128h(r, e);
22671    }
22672
22673    #[simd_test(enable = "avx512fp16,avx512vl")]
22674    unsafe fn test_mm_min_round_sh() {
22675        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22676        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22677        let r = _mm_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
22678        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22679        assert_eq_m128h(r, e);
22680    }
22681
22682    #[simd_test(enable = "avx512fp16,avx512vl")]
22683    unsafe fn test_mm_mask_min_round_sh() {
22684        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22685        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22686        let src = _mm_setr_ph(3.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0);
22687        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22688            src, 0, a, b,
22689        );
22690        let e = _mm_setr_ph(3.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22691        assert_eq_m128h(r, e);
22692        let r = _mm_mask_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
22693            src, 1, a, b,
22694        );
22695        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22696        assert_eq_m128h(r, e);
22697    }
22698
22699    #[simd_test(enable = "avx512fp16,avx512vl")]
22700    unsafe fn test_mm_maskz_min_round_sh() {
22701        let a = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22702        let b = _mm_setr_ph(2.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0);
22703        let r =
22704            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
22705        let e = _mm_setr_ph(0.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22706        assert_eq_m128h(r, e);
22707        let r =
22708            _mm_maskz_min_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
22709        let e = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
22710        assert_eq_m128h(r, e);
22711    }
22712
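    // getexp returns each element's exponent as a floating-point value, effectively
    // floor(log2(|x|)); 3.0 = 1.5 * 2^1, hence the expected 1.0.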
22713    #[simd_test(enable = "avx512fp16,avx512vl")]
22714    unsafe fn test_mm_getexp_ph() {
22715        let a = _mm_set1_ph(3.0);
22716        let r = _mm_getexp_ph(a);
22717        let e = _mm_set1_ph(1.0);
22718        assert_eq_m128h(r, e);
22719    }
22720
22721    #[simd_test(enable = "avx512fp16,avx512vl")]
22722    unsafe fn test_mm_mask_getexp_ph() {
22723        let a = _mm_set1_ph(3.0);
22724        let src = _mm_set1_ph(4.0);
22725        let r = _mm_mask_getexp_ph(src, 0b01010101, a);
22726        let e = _mm_set_ph(4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0);
22727        assert_eq_m128h(r, e);
22728    }
22729
22730    #[simd_test(enable = "avx512fp16,avx512vl")]
22731    unsafe fn test_mm_maskz_getexp_ph() {
22732        let a = _mm_set1_ph(3.0);
22733        let r = _mm_maskz_getexp_ph(0b01010101, a);
22734        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
22735        assert_eq_m128h(r, e);
22736    }
22737
22738    #[simd_test(enable = "avx512fp16,avx512vl")]
22739    unsafe fn test_mm256_getexp_ph() {
22740        let a = _mm256_set1_ph(3.0);
22741        let r = _mm256_getexp_ph(a);
22742        let e = _mm256_set1_ph(1.0);
22743        assert_eq_m256h(r, e);
22744    }
22745
22746    #[simd_test(enable = "avx512fp16,avx512vl")]
22747    unsafe fn test_mm256_mask_getexp_ph() {
22748        let a = _mm256_set1_ph(3.0);
22749        let src = _mm256_set1_ph(4.0);
22750        let r = _mm256_mask_getexp_ph(src, 0b0101010101010101, a);
22751        let e = _mm256_set_ph(
22752            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22753        );
22754        assert_eq_m256h(r, e);
22755    }
22756
22757    #[simd_test(enable = "avx512fp16,avx512vl")]
22758    unsafe fn test_mm256_maskz_getexp_ph() {
22759        let a = _mm256_set1_ph(3.0);
22760        let r = _mm256_maskz_getexp_ph(0b0101010101010101, a);
22761        let e = _mm256_set_ph(
22762            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22763        );
22764        assert_eq_m256h(r, e);
22765    }
22766
22767    #[simd_test(enable = "avx512fp16")]
22768    unsafe fn test_mm512_getexp_ph() {
22769        let a = _mm512_set1_ph(3.0);
22770        let r = _mm512_getexp_ph(a);
22771        let e = _mm512_set1_ph(1.0);
22772        assert_eq_m512h(r, e);
22773    }
22774
22775    #[simd_test(enable = "avx512fp16")]
22776    unsafe fn test_mm512_mask_getexp_ph() {
22777        let a = _mm512_set1_ph(3.0);
22778        let src = _mm512_set1_ph(4.0);
22779        let r = _mm512_mask_getexp_ph(src, 0b01010101010101010101010101010101, a);
22780        let e = _mm512_set_ph(
22781            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22782            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22783        );
22784        assert_eq_m512h(r, e);
22785    }
22786
22787    #[simd_test(enable = "avx512fp16")]
22788    unsafe fn test_mm512_maskz_getexp_ph() {
22789        let a = _mm512_set1_ph(3.0);
22790        let r = _mm512_maskz_getexp_ph(0b01010101010101010101010101010101, a);
22791        let e = _mm512_set_ph(
22792            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22793            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22794        );
22795        assert_eq_m512h(r, e);
22796    }
22797
22798    #[simd_test(enable = "avx512fp16")]
22799    unsafe fn test_mm512_getexp_round_ph() {
22800        let a = _mm512_set1_ph(3.0);
22801        let r = _mm512_getexp_round_ph::<_MM_FROUND_NO_EXC>(a);
22802        let e = _mm512_set1_ph(1.0);
22803        assert_eq_m512h(r, e);
22804    }
22805
22806    #[simd_test(enable = "avx512fp16")]
22807    unsafe fn test_mm512_mask_getexp_round_ph() {
22808        let a = _mm512_set1_ph(3.0);
22809        let src = _mm512_set1_ph(4.0);
22810        let r = _mm512_mask_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22811            src,
22812            0b01010101010101010101010101010101,
22813            a,
22814        );
22815        let e = _mm512_set_ph(
22816            4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0,
22817            1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0, 4.0, 1.0,
22818        );
22819        assert_eq_m512h(r, e);
22820    }
22821
22822    #[simd_test(enable = "avx512fp16")]
22823    unsafe fn test_mm512_maskz_getexp_round_ph() {
22824        let a = _mm512_set1_ph(3.0);
22825        let r = _mm512_maskz_getexp_round_ph::<_MM_FROUND_NO_EXC>(
22826            0b01010101010101010101010101010101,
22827            a,
22828        );
22829        let e = _mm512_set_ph(
22830            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
22831            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
22832        );
22833        assert_eq_m512h(r, e);
22834    }
22835
22836    #[simd_test(enable = "avx512fp16,avx512vl")]
22837    unsafe fn test_mm_getexp_sh() {
22838        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22839        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22840        let r = _mm_getexp_sh(a, b);
22841        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22842        assert_eq_m128h(r, e);
22843    }
22844
22845    #[simd_test(enable = "avx512fp16,avx512vl")]
22846    unsafe fn test_mm_mask_getexp_sh() {
22847        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22848        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22849        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22850        let r = _mm_mask_getexp_sh(src, 0, a, b);
22851        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22852        assert_eq_m128h(r, e);
22853        let r = _mm_mask_getexp_sh(src, 1, a, b);
22854        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22855        assert_eq_m128h(r, e);
22856    }
22857
22858    #[simd_test(enable = "avx512fp16,avx512vl")]
22859    unsafe fn test_mm_maskz_getexp_sh() {
22860        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22861        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22862        let r = _mm_maskz_getexp_sh(0, a, b);
22863        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22864        assert_eq_m128h(r, e);
22865        let r = _mm_maskz_getexp_sh(1, a, b);
22866        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22867        assert_eq_m128h(r, e);
22868    }
22869
22870    #[simd_test(enable = "avx512fp16,avx512vl")]
22871    unsafe fn test_mm_getexp_round_sh() {
22872        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22873        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22874        let r = _mm_getexp_round_sh::<_MM_FROUND_NO_EXC>(a, b);
22875        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22876        assert_eq_m128h(r, e);
22877    }
22878
22879    #[simd_test(enable = "avx512fp16,avx512vl")]
22880    unsafe fn test_mm_mask_getexp_round_sh() {
22881        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22882        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22883        let src = _mm_setr_ph(4.0, 30., 31., 32., 33., 34., 35., 36.);
22884        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 0, a, b);
22885        let e = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22886        assert_eq_m128h(r, e);
22887        let r = _mm_mask_getexp_round_sh::<_MM_FROUND_NO_EXC>(src, 1, a, b);
22888        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22889        assert_eq_m128h(r, e);
22890    }
22891
22892    #[simd_test(enable = "avx512fp16,avx512vl")]
22893    unsafe fn test_mm_maskz_getexp_round_sh() {
22894        let a = _mm_setr_ph(4.0, 10., 11., 12., 13., 14., 15., 16.);
22895        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
22896        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(0, a, b);
22897        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
22898        assert_eq_m128h(r, e);
22899        let r = _mm_maskz_getexp_round_sh::<_MM_FROUND_NO_EXC>(1, a, b);
22900        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
22901        assert_eq_m128h(r, e);
22902    }
22903
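    // getmant normalizes each element's mantissa into the interval selected by the first const
    // parameter and applies the sign policy from the second. With `_MM_MANT_NORM_P75_1P5`
    // (interval [0.75, 1.5)), 10.0 = 1.25 * 2^3 normalizes to 1.25.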
22904    #[simd_test(enable = "avx512fp16,avx512vl")]
22905    unsafe fn test_mm_getmant_ph() {
22906        let a = _mm_set1_ph(10.0);
22907        let r = _mm_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22908        let e = _mm_set1_ph(1.25);
22909        assert_eq_m128h(r, e);
22910    }
22911
22912    #[simd_test(enable = "avx512fp16,avx512vl")]
22913    unsafe fn test_mm_mask_getmant_ph() {
22914        let a = _mm_set1_ph(10.0);
22915        let src = _mm_set1_ph(20.0);
22916        let r = _mm_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0b01010101, a);
22917        let e = _mm_set_ph(20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25);
22918        assert_eq_m128h(r, e);
22919    }
22920
22921    #[simd_test(enable = "avx512fp16,avx512vl")]
22922    unsafe fn test_mm_maskz_getmant_ph() {
22923        let a = _mm_set1_ph(10.0);
22924        let r = _mm_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0b01010101, a);
22925        let e = _mm_set_ph(0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25);
22926        assert_eq_m128h(r, e);
22927    }
22928
22929    #[simd_test(enable = "avx512fp16,avx512vl")]
22930    unsafe fn test_mm256_getmant_ph() {
22931        let a = _mm256_set1_ph(10.0);
22932        let r = _mm256_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22933        let e = _mm256_set1_ph(1.25);
22934        assert_eq_m256h(r, e);
22935    }
22936
22937    #[simd_test(enable = "avx512fp16,avx512vl")]
22938    unsafe fn test_mm256_mask_getmant_ph() {
22939        let a = _mm256_set1_ph(10.0);
22940        let src = _mm256_set1_ph(20.0);
22941        let r = _mm256_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22942            src,
22943            0b0101010101010101,
22944            a,
22945        );
22946        let e = _mm256_set_ph(
22947            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22948            20.0, 1.25,
22949        );
22950        assert_eq_m256h(r, e);
22951    }
22952
22953    #[simd_test(enable = "avx512fp16,avx512vl")]
22954    unsafe fn test_mm256_maskz_getmant_ph() {
22955        let a = _mm256_set1_ph(10.0);
22956        let r = _mm256_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22957            0b0101010101010101,
22958            a,
22959        );
22960        let e = _mm256_set_ph(
22961            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
22962        );
22963        assert_eq_m256h(r, e);
22964    }
22965
22966    #[simd_test(enable = "avx512fp16")]
22967    unsafe fn test_mm512_getmant_ph() {
22968        let a = _mm512_set1_ph(10.0);
22969        let r = _mm512_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a);
22970        let e = _mm512_set1_ph(1.25);
22971        assert_eq_m512h(r, e);
22972    }
22973
22974    #[simd_test(enable = "avx512fp16")]
22975    unsafe fn test_mm512_mask_getmant_ph() {
22976        let a = _mm512_set1_ph(10.0);
22977        let src = _mm512_set1_ph(20.0);
22978        let r = _mm512_mask_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22979            src,
22980            0b01010101010101010101010101010101,
22981            a,
22982        );
22983        let e = _mm512_set_ph(
22984            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22985            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
22986            20.0, 1.25, 20.0, 1.25,
22987        );
22988        assert_eq_m512h(r, e);
22989    }
22990
22991    #[simd_test(enable = "avx512fp16")]
22992    unsafe fn test_mm512_maskz_getmant_ph() {
22993        let a = _mm512_set1_ph(10.0);
22994        let r = _mm512_maskz_getmant_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(
22995            0b01010101010101010101010101010101,
22996            a,
22997        );
22998        let e = _mm512_set_ph(
22999            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
23000            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
23001        );
23002        assert_eq_m512h(r, e);
23003    }
23004
23005    #[simd_test(enable = "avx512fp16")]
23006    unsafe fn test_mm512_getmant_round_ph() {
23007        let a = _mm512_set1_ph(10.0);
23008        let r =
23009            _mm512_getmant_round_ph::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23010                a,
23011            );
23012        let e = _mm512_set1_ph(1.25);
23013        assert_eq_m512h(r, e);
23014    }
23015
23016    #[simd_test(enable = "avx512fp16")]
23017    unsafe fn test_mm512_mask_getmant_round_ph() {
23018        let a = _mm512_set1_ph(10.0);
23019        let src = _mm512_set1_ph(20.0);
23020        let r = _mm512_mask_getmant_round_ph::<
23021            _MM_MANT_NORM_P75_1P5,
23022            _MM_MANT_SIGN_NAN,
23023            _MM_FROUND_NO_EXC,
23024        >(src, 0b01010101010101010101010101010101, a);
23025        let e = _mm512_set_ph(
23026            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
23027            20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25, 20.0, 1.25,
23028            20.0, 1.25, 20.0, 1.25,
23029        );
23030        assert_eq_m512h(r, e);
23031    }
23032
23033    #[simd_test(enable = "avx512fp16")]
23034    unsafe fn test_mm512_maskz_getmant_round_ph() {
23035        let a = _mm512_set1_ph(10.0);
23036        let r = _mm512_maskz_getmant_round_ph::<
23037            _MM_MANT_NORM_P75_1P5,
23038            _MM_MANT_SIGN_NAN,
23039            _MM_FROUND_NO_EXC,
23040        >(0b01010101010101010101010101010101, a);
23041        let e = _mm512_set_ph(
23042            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
23043            0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25, 0.0, 1.25,
23044        );
23045        assert_eq_m512h(r, e);
23046    }
23047
23048    #[simd_test(enable = "avx512fp16,avx512vl")]
23049    unsafe fn test_mm_getmant_sh() {
23050        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23051        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23052        let r = _mm_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(a, b);
23053        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23054        assert_eq_m128h(r, e);
23055    }
23056
23057    #[simd_test(enable = "avx512fp16,avx512vl")]
23058    unsafe fn test_mm_mask_getmant_sh() {
23059        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23060        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23061        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
23062        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 0, a, b);
23063        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
23064        assert_eq_m128h(r, e);
23065        let r = _mm_mask_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(src, 1, a, b);
23066        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23067        assert_eq_m128h(r, e);
23068    }
23069
23070    #[simd_test(enable = "avx512fp16,avx512vl")]
23071    unsafe fn test_mm_maskz_getmant_sh() {
23072        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23073        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23074        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(0, a, b);
23075        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23076        assert_eq_m128h(r, e);
23077        let r = _mm_maskz_getmant_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(1, a, b);
23078        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23079        assert_eq_m128h(r, e);
23080    }
23081
23082    #[simd_test(enable = "avx512fp16,avx512vl")]
23083    unsafe fn test_mm_getmant_round_sh() {
23084        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23085        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23086        let r = _mm_getmant_round_sh::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN, _MM_FROUND_NO_EXC>(
23087            a, b,
23088        );
23089        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23090        assert_eq_m128h(r, e);
23091    }
23092
23093    #[simd_test(enable = "avx512fp16,avx512vl")]
23094    unsafe fn test_mm_mask_getmant_round_sh() {
23095        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23096        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23097        let src = _mm_setr_ph(20.0, 30., 31., 32., 33., 34., 35., 36.);
23098        let r = _mm_mask_getmant_round_sh::<
23099            _MM_MANT_NORM_P75_1P5,
23100            _MM_MANT_SIGN_NAN,
23101            _MM_FROUND_NO_EXC,
23102        >(src, 0, a, b);
23103        let e = _mm_setr_ph(20.0, 10., 11., 12., 13., 14., 15., 16.);
23104        assert_eq_m128h(r, e);
23105        let r = _mm_mask_getmant_round_sh::<
23106            _MM_MANT_NORM_P75_1P5,
23107            _MM_MANT_SIGN_NAN,
23108            _MM_FROUND_NO_EXC,
23109        >(src, 1, a, b);
23110        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23111        assert_eq_m128h(r, e);
23112    }
23113
23114    #[simd_test(enable = "avx512fp16,avx512vl")]
23115    unsafe fn test_mm_maskz_getmant_round_sh() {
23116        let a = _mm_setr_ph(15.0, 10., 11., 12., 13., 14., 15., 16.);
23117        let b = _mm_setr_ph(10.0, 20., 21., 22., 23., 24., 25., 26.);
23118        let r = _mm_maskz_getmant_round_sh::<
23119            _MM_MANT_NORM_P75_1P5,
23120            _MM_MANT_SIGN_NAN,
23121            _MM_FROUND_NO_EXC,
23122        >(0, a, b);
23123        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23124        assert_eq_m128h(r, e);
23125        let r = _mm_maskz_getmant_round_sh::<
23126            _MM_MANT_NORM_P75_1P5,
23127            _MM_MANT_SIGN_NAN,
23128            _MM_FROUND_NO_EXC,
23129        >(1, a, b);
23130        let e = _mm_setr_ph(1.25, 10., 11., 12., 13., 14., 15., 16.);
23131        assert_eq_m128h(r, e);
23132    }
23133
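    // roundscale rounds each element to the number of fraction bits encoded in the upper half
    // of the immediate, using the rounding mode from its lower half. An immediate of 0 means
    // zero fraction bits with round-to-nearest, i.e. round to the nearest integer, so 1.1 -> 1.0.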
23134    #[simd_test(enable = "avx512fp16,avx512vl")]
23135    unsafe fn test_mm_roundscale_ph() {
23136        let a = _mm_set1_ph(1.1);
23137        let r = _mm_roundscale_ph::<0>(a);
23138        let e = _mm_set1_ph(1.0);
23139        assert_eq_m128h(r, e);
23140    }
23141
23142    #[simd_test(enable = "avx512fp16,avx512vl")]
23143    unsafe fn test_mm_mask_roundscale_ph() {
23144        let a = _mm_set1_ph(1.1);
23145        let src = _mm_set1_ph(2.0);
23146        let r = _mm_mask_roundscale_ph::<0>(src, 0b01010101, a);
23147        let e = _mm_set_ph(2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0);
23148        assert_eq_m128h(r, e);
23149    }
23150
23151    #[simd_test(enable = "avx512fp16,avx512vl")]
23152    unsafe fn test_mm_maskz_roundscale_ph() {
23153        let a = _mm_set1_ph(1.1);
23154        let r = _mm_maskz_roundscale_ph::<0>(0b01010101, a);
23155        let e = _mm_set_ph(0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
23156        assert_eq_m128h(r, e);
23157    }
23158
23159    #[simd_test(enable = "avx512fp16,avx512vl")]
23160    unsafe fn test_mm256_roundscale_ph() {
23161        let a = _mm256_set1_ph(1.1);
23162        let r = _mm256_roundscale_ph::<0>(a);
23163        let e = _mm256_set1_ph(1.0);
23164        assert_eq_m256h(r, e);
23165    }
23166
23167    #[simd_test(enable = "avx512fp16,avx512vl")]
23168    unsafe fn test_mm256_mask_roundscale_ph() {
23169        let a = _mm256_set1_ph(1.1);
23170        let src = _mm256_set1_ph(2.0);
23171        let r = _mm256_mask_roundscale_ph::<0>(src, 0b0101010101010101, a);
23172        let e = _mm256_set_ph(
23173            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23174        );
23175        assert_eq_m256h(r, e);
23176    }
23177
23178    #[simd_test(enable = "avx512fp16,avx512vl")]
23179    unsafe fn test_mm256_maskz_roundscale_ph() {
23180        let a = _mm256_set1_ph(1.1);
23181        let r = _mm256_maskz_roundscale_ph::<0>(0b0101010101010101, a);
23182        let e = _mm256_set_ph(
23183            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23184        );
23185        assert_eq_m256h(r, e);
23186    }
23187
23188    #[simd_test(enable = "avx512fp16")]
23189    unsafe fn test_mm512_roundscale_ph() {
23190        let a = _mm512_set1_ph(1.1);
23191        let r = _mm512_roundscale_ph::<0>(a);
23192        let e = _mm512_set1_ph(1.0);
23193        assert_eq_m512h(r, e);
23194    }
23195
23196    #[simd_test(enable = "avx512fp16")]
23197    unsafe fn test_mm512_mask_roundscale_ph() {
23198        let a = _mm512_set1_ph(1.1);
23199        let src = _mm512_set1_ph(2.0);
23200        let r = _mm512_mask_roundscale_ph::<0>(src, 0b01010101010101010101010101010101, a);
23201        let e = _mm512_set_ph(
23202            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23203            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23204        );
23205        assert_eq_m512h(r, e);
23206    }
23207
23208    #[simd_test(enable = "avx512fp16")]
23209    unsafe fn test_mm512_maskz_roundscale_ph() {
23210        let a = _mm512_set1_ph(1.1);
23211        let r = _mm512_maskz_roundscale_ph::<0>(0b01010101010101010101010101010101, a);
23212        let e = _mm512_set_ph(
23213            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23214            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23215        );
23216        assert_eq_m512h(r, e);
23217    }
23218
23219    #[simd_test(enable = "avx512fp16")]
23220    unsafe fn test_mm512_roundscale_round_ph() {
23221        let a = _mm512_set1_ph(1.1);
23222        let r = _mm512_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(a);
23223        let e = _mm512_set1_ph(1.0);
23224        assert_eq_m512h(r, e);
23225    }
23226
23227    #[simd_test(enable = "avx512fp16")]
23228    unsafe fn test_mm512_mask_roundscale_round_ph() {
23229        let a = _mm512_set1_ph(1.1);
23230        let src = _mm512_set1_ph(2.0);
23231        let r = _mm512_mask_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23232            src,
23233            0b01010101010101010101010101010101,
23234            a,
23235        );
23236        let e = _mm512_set_ph(
23237            2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0,
23238            1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0,
23239        );
23240        assert_eq_m512h(r, e);
23241    }
23242
23243    #[simd_test(enable = "avx512fp16")]
23244    unsafe fn test_mm512_maskz_roundscale_round_ph() {
23245        let a = _mm512_set1_ph(1.1);
23246        let r = _mm512_maskz_roundscale_round_ph::<0, _MM_FROUND_NO_EXC>(
23247            0b01010101010101010101010101010101,
23248            a,
23249        );
23250        let e = _mm512_set_ph(
23251            0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0,
23252            1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0,
23253        );
23254        assert_eq_m512h(r, e);
23255    }
23256
23257    #[simd_test(enable = "avx512fp16,avx512vl")]
23258    unsafe fn test_mm_roundscale_sh() {
23259        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23260        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23261        let r = _mm_roundscale_sh::<0>(a, b);
23262        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23263        assert_eq_m128h(r, e);
23264    }
23265
23266    #[simd_test(enable = "avx512fp16,avx512vl")]
23267    unsafe fn test_mm_mask_roundscale_sh() {
23268        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23269        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23270        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23271        let r = _mm_mask_roundscale_sh::<0>(src, 0, a, b);
23272        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23273        assert_eq_m128h(r, e);
23274        let r = _mm_mask_roundscale_sh::<0>(src, 1, a, b);
23275        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23276        assert_eq_m128h(r, e);
23277    }
23278
23279    #[simd_test(enable = "avx512fp16,avx512vl")]
23280    unsafe fn test_mm_maskz_roundscale_sh() {
23281        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23282        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23283        let r = _mm_maskz_roundscale_sh::<0>(0, a, b);
23284        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23285        assert_eq_m128h(r, e);
23286        let r = _mm_maskz_roundscale_sh::<0>(1, a, b);
23287        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23288        assert_eq_m128h(r, e);
23289    }
23290
23291    #[simd_test(enable = "avx512fp16,avx512vl")]
23292    unsafe fn test_mm_roundscale_round_sh() {
23293        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23294        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23295        let r = _mm_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(a, b);
23296        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23297        assert_eq_m128h(r, e);
23298    }
23299
23300    #[simd_test(enable = "avx512fp16,avx512vl")]
23301    unsafe fn test_mm_mask_roundscale_round_sh() {
23302        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23303        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23304        let src = _mm_setr_ph(3.0, 30., 31., 32., 33., 34., 35., 36.);
23305        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 0, a, b);
23306        let e = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23307        assert_eq_m128h(r, e);
23308        let r = _mm_mask_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(src, 1, a, b);
23309        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23310        assert_eq_m128h(r, e);
23311    }
23312
23313    #[simd_test(enable = "avx512fp16,avx512vl")]
23314    unsafe fn test_mm_maskz_roundscale_round_sh() {
23315        let a = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23316        let b = _mm_setr_ph(1.1, 20., 21., 22., 23., 24., 25., 26.);
23317        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(0, a, b);
23318        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23319        assert_eq_m128h(r, e);
23320        let r = _mm_maskz_roundscale_round_sh::<0, _MM_FROUND_NO_EXC>(1, a, b);
23321        let e = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23322        assert_eq_m128h(r, e);
23323    }
23324
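    // scalef computes a * 2^floor(b) element-wise, so with a == 1.0 and b == 3.0 every selected
    // lane becomes 8.0.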
23325    #[simd_test(enable = "avx512fp16,avx512vl")]
23326    unsafe fn test_mm_scalef_ph() {
23327        let a = _mm_set1_ph(1.);
23328        let b = _mm_set1_ph(3.);
23329        let r = _mm_scalef_ph(a, b);
23330        let e = _mm_set1_ph(8.0);
23331        assert_eq_m128h(r, e);
23332    }
23333
23334    #[simd_test(enable = "avx512fp16,avx512vl")]
23335    unsafe fn test_mm_mask_scalef_ph() {
23336        let a = _mm_set1_ph(1.);
23337        let b = _mm_set1_ph(3.);
23338        let src = _mm_set1_ph(2.);
23339        let r = _mm_mask_scalef_ph(src, 0b01010101, a, b);
23340        let e = _mm_set_ph(2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0);
23341        assert_eq_m128h(r, e);
23342    }
23343
23344    #[simd_test(enable = "avx512fp16,avx512vl")]
23345    unsafe fn test_mm_maskz_scalef_ph() {
23346        let a = _mm_set1_ph(1.);
23347        let b = _mm_set1_ph(3.);
23348        let r = _mm_maskz_scalef_ph(0b01010101, a, b);
23349        let e = _mm_set_ph(0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0);
23350        assert_eq_m128h(r, e);
23351    }
23352
23353    #[simd_test(enable = "avx512fp16,avx512vl")]
23354    unsafe fn test_mm256_scalef_ph() {
23355        let a = _mm256_set1_ph(1.);
23356        let b = _mm256_set1_ph(3.);
23357        let r = _mm256_scalef_ph(a, b);
23358        let e = _mm256_set1_ph(8.0);
23359        assert_eq_m256h(r, e);
23360    }
23361
23362    #[simd_test(enable = "avx512fp16,avx512vl")]
23363    unsafe fn test_mm256_mask_scalef_ph() {
23364        let a = _mm256_set1_ph(1.);
23365        let b = _mm256_set1_ph(3.);
23366        let src = _mm256_set1_ph(2.);
23367        let r = _mm256_mask_scalef_ph(src, 0b0101010101010101, a, b);
23368        let e = _mm256_set_ph(
23369            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23370        );
23371        assert_eq_m256h(r, e);
23372    }
23373
23374    #[simd_test(enable = "avx512fp16,avx512vl")]
23375    unsafe fn test_mm256_maskz_scalef_ph() {
23376        let a = _mm256_set1_ph(1.);
23377        let b = _mm256_set1_ph(3.);
23378        let r = _mm256_maskz_scalef_ph(0b0101010101010101, a, b);
23379        let e = _mm256_set_ph(
23380            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23381        );
23382        assert_eq_m256h(r, e);
23383    }
23384
23385    #[simd_test(enable = "avx512fp16")]
23386    unsafe fn test_mm512_scalef_ph() {
23387        let a = _mm512_set1_ph(1.);
23388        let b = _mm512_set1_ph(3.);
23389        let r = _mm512_scalef_ph(a, b);
23390        let e = _mm512_set1_ph(8.0);
23391        assert_eq_m512h(r, e);
23392    }
23393
23394    #[simd_test(enable = "avx512fp16")]
23395    unsafe fn test_mm512_mask_scalef_ph() {
23396        let a = _mm512_set1_ph(1.);
23397        let b = _mm512_set1_ph(3.);
23398        let src = _mm512_set1_ph(2.);
23399        let r = _mm512_mask_scalef_ph(src, 0b01010101010101010101010101010101, a, b);
23400        let e = _mm512_set_ph(
23401            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23402            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23403        );
23404        assert_eq_m512h(r, e);
23405    }
23406
23407    #[simd_test(enable = "avx512fp16")]
23408    unsafe fn test_mm512_maskz_scalef_ph() {
23409        let a = _mm512_set1_ph(1.);
23410        let b = _mm512_set1_ph(3.);
23411        let r = _mm512_maskz_scalef_ph(0b01010101010101010101010101010101, a, b);
23412        let e = _mm512_set_ph(
23413            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23414            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23415        );
23416        assert_eq_m512h(r, e);
23417    }
23418
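    // The *_round_* variants take the rounding behaviour as a const parameter. These tests pass
    // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, exceptions
    // suppressed), which for these inputs produces the same results as the plain variants above.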
23419    #[simd_test(enable = "avx512fp16")]
23420    unsafe fn test_mm512_scalef_round_ph() {
23421        let a = _mm512_set1_ph(1.);
23422        let b = _mm512_set1_ph(3.);
23423        let r = _mm512_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23424        let e = _mm512_set1_ph(8.0);
23425        assert_eq_m512h(r, e);
23426    }
23427
23428    #[simd_test(enable = "avx512fp16")]
23429    unsafe fn test_mm512_mask_scalef_round_ph() {
23430        let a = _mm512_set1_ph(1.);
23431        let b = _mm512_set1_ph(3.);
23432        let src = _mm512_set1_ph(2.);
23433        let r = _mm512_mask_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23434            src,
23435            0b01010101010101010101010101010101,
23436            a,
23437            b,
23438        );
23439        let e = _mm512_set_ph(
23440            2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0,
23441            8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0, 2.0, 8.0,
23442        );
23443        assert_eq_m512h(r, e);
23444    }
23445
23446    #[simd_test(enable = "avx512fp16")]
23447    unsafe fn test_mm512_maskz_scalef_round_ph() {
23448        let a = _mm512_set1_ph(1.);
23449        let b = _mm512_set1_ph(3.);
23450        let r = _mm512_maskz_scalef_round_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23451            0b01010101010101010101010101010101,
23452            a,
23453            b,
23454        );
23455        let e = _mm512_set_ph(
23456            0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0,
23457            8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0, 0.0, 8.0,
23458        );
23459        assert_eq_m512h(r, e);
23460    }
23461
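    // The scalar scalef_sh forms scale only lane 0 (here 1.0 * 2^floor(3.0) = 8.0) and copy
    // lanes 1..7 from a; the masked forms substitute src[0] (or 0.0 for maskz) when the mask
    // bit is clear.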
23462    #[simd_test(enable = "avx512fp16,avx512vl")]
23463    unsafe fn test_mm_scalef_sh() {
23464        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23465        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23466        let r = _mm_scalef_sh(a, b);
23467        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23468        assert_eq_m128h(r, e);
23469    }
23470
23471    #[simd_test(enable = "avx512fp16,avx512vl")]
23472    unsafe fn test_mm_mask_scalef_sh() {
23473        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23474        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23475        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23476        let r = _mm_mask_scalef_sh(src, 0, a, b);
23477        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23478        assert_eq_m128h(r, e);
23479        let r = _mm_mask_scalef_sh(src, 1, a, b);
23480        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23481        assert_eq_m128h(r, e);
23482    }
23483
23484    #[simd_test(enable = "avx512fp16,avx512vl")]
23485    unsafe fn test_mm_maskz_scalef_sh() {
23486        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23487        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23488        let r = _mm_maskz_scalef_sh(0, a, b);
23489        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23490        assert_eq_m128h(r, e);
23491        let r = _mm_maskz_scalef_sh(1, a, b);
23492        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23493        assert_eq_m128h(r, e);
23494    }
23495
23496    #[simd_test(enable = "avx512fp16,avx512vl")]
23497    unsafe fn test_mm_scalef_round_sh() {
23498        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23499        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23500        let r = _mm_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
23501        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23502        assert_eq_m128h(r, e);
23503    }
23504
23505    #[simd_test(enable = "avx512fp16,avx512vl")]
23506    unsafe fn test_mm_mask_scalef_round_sh() {
23507        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23508        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23509        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23510        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23511            src, 0, a, b,
23512        );
23513        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23514        assert_eq_m128h(r, e);
23515        let r = _mm_mask_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
23516            src, 1, a, b,
23517        );
23518        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23519        assert_eq_m128h(r, e);
23520    }
23521
23522    #[simd_test(enable = "avx512fp16,avx512vl")]
23523    unsafe fn test_mm_maskz_scalef_round_sh() {
23524        let a = _mm_setr_ph(1.0, 10., 11., 12., 13., 14., 15., 16.);
23525        let b = _mm_setr_ph(3.0, 20., 21., 22., 23., 24., 25., 26.);
23526        let r =
23527            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
23528        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23529        assert_eq_m128h(r, e);
23530        let r =
23531            _mm_maskz_scalef_round_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
23532        let e = _mm_setr_ph(8.0, 10., 11., 12., 13., 14., 15., 16.);
23533        assert_eq_m128h(r, e);
23534    }
23535
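    // Per Intel's documentation, reduce (VREDUCEPH) returns the "reduced argument": a minus a
    // rounded to a fixed number of fraction bits. Bits 7:4 of the imm8 give the number of
    // fraction bits to keep and the low bits pick the rounding mode, so 16 | _MM_FROUND_TO_ZERO
    // keeps one fraction bit and truncates: 1.25 rounds to 1.0 and the reduced value in every
    // selected lane is 1.25 - 1.0 = 0.25.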
23536    #[simd_test(enable = "avx512fp16,avx512vl")]
23537    unsafe fn test_mm_reduce_ph() {
23538        let a = _mm_set1_ph(1.25);
23539        let r = _mm_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23540        let e = _mm_set1_ph(0.25);
23541        assert_eq_m128h(r, e);
23542    }
23543
23544    #[simd_test(enable = "avx512fp16,avx512vl")]
23545    unsafe fn test_mm_mask_reduce_ph() {
23546        let a = _mm_set1_ph(1.25);
23547        let src = _mm_set1_ph(2.0);
23548        let r = _mm_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b01010101, a);
23549        let e = _mm_set_ph(2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25);
23550        assert_eq_m128h(r, e);
23551    }
23552
23553    #[simd_test(enable = "avx512fp16,avx512vl")]
23554    unsafe fn test_mm_maskz_reduce_ph() {
23555        let a = _mm_set1_ph(1.25);
23556        let r = _mm_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b01010101, a);
23557        let e = _mm_set_ph(0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25);
23558        assert_eq_m128h(r, e);
23559    }
23560
23561    #[simd_test(enable = "avx512fp16,avx512vl")]
23562    unsafe fn test_mm256_reduce_ph() {
23563        let a = _mm256_set1_ph(1.25);
23564        let r = _mm256_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23565        let e = _mm256_set1_ph(0.25);
23566        assert_eq_m256h(r, e);
23567    }
23568
23569    #[simd_test(enable = "avx512fp16,avx512vl")]
23570    unsafe fn test_mm256_mask_reduce_ph() {
23571        let a = _mm256_set1_ph(1.25);
23572        let src = _mm256_set1_ph(2.0);
23573        let r = _mm256_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0b0101010101010101, a);
23574        let e = _mm256_set_ph(
23575            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23576        );
23577        assert_eq_m256h(r, e);
23578    }
23579
23580    #[simd_test(enable = "avx512fp16,avx512vl")]
23581    unsafe fn test_mm256_maskz_reduce_ph() {
23582        let a = _mm256_set1_ph(1.25);
23583        let r = _mm256_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(0b0101010101010101, a);
23584        let e = _mm256_set_ph(
23585            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23586        );
23587        assert_eq_m256h(r, e);
23588    }
23589
23590    #[simd_test(enable = "avx512fp16")]
23591    unsafe fn test_mm512_reduce_ph() {
23592        let a = _mm512_set1_ph(1.25);
23593        let r = _mm512_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(a);
23594        let e = _mm512_set1_ph(0.25);
23595        assert_eq_m512h(r, e);
23596    }
23597
23598    #[simd_test(enable = "avx512fp16")]
23599    unsafe fn test_mm512_mask_reduce_ph() {
23600        let a = _mm512_set1_ph(1.25);
23601        let src = _mm512_set1_ph(2.0);
23602        let r = _mm512_mask_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23603            src,
23604            0b01010101010101010101010101010101,
23605            a,
23606        );
23607        let e = _mm512_set_ph(
23608            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23609            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23610        );
23611        assert_eq_m512h(r, e);
23612    }
23613
23614    #[simd_test(enable = "avx512fp16")]
23615    unsafe fn test_mm512_maskz_reduce_ph() {
23616        let a = _mm512_set1_ph(1.25);
23617        let r = _mm512_maskz_reduce_ph::<{ 16 | _MM_FROUND_TO_ZERO }>(
23618            0b01010101010101010101010101010101,
23619            a,
23620        );
23621        let e = _mm512_set_ph(
23622            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23623            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23624        );
23625        assert_eq_m512h(r, e);
23626    }
23627
23628    #[simd_test(enable = "avx512fp16")]
23629    unsafe fn test_mm512_reduce_round_ph() {
23630        let a = _mm512_set1_ph(1.25);
23631        let r = _mm512_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a);
23632        let e = _mm512_set1_ph(0.25);
23633        assert_eq_m512h(r, e);
23634    }
23635
23636    #[simd_test(enable = "avx512fp16")]
23637    unsafe fn test_mm512_mask_reduce_round_ph() {
23638        let a = _mm512_set1_ph(1.25);
23639        let src = _mm512_set1_ph(2.0);
23640        let r = _mm512_mask_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23641            src,
23642            0b01010101010101010101010101010101,
23643            a,
23644        );
23645        let e = _mm512_set_ph(
23646            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23647            2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25, 2.0, 0.25,
23648        );
23649        assert_eq_m512h(r, e);
23650    }
23651
23652    #[simd_test(enable = "avx512fp16")]
23653    unsafe fn test_mm512_maskz_reduce_round_ph() {
23654        let a = _mm512_set1_ph(1.25);
23655        let r = _mm512_maskz_reduce_round_ph::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23656            0b01010101010101010101010101010101,
23657            a,
23658        );
23659        let e = _mm512_set_ph(
23660            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23661            0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25, 0.0, 0.25,
23662        );
23663        assert_eq_m512h(r, e);
23664    }
23665
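    // The scalar reduce_sh forms apply the same imm8 to lane 0 of b only, copying lanes 1..7
    // from a; the mask/maskz variants then choose between that result, src[0] and 0.0 for lane 0.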
23666    #[simd_test(enable = "avx512fp16,avx512vl")]
23667    unsafe fn test_mm_reduce_sh() {
23668        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23669        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23670        let r = _mm_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(a, b);
23671        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23672        assert_eq_m128h(r, e);
23673    }
23674
23675    #[simd_test(enable = "avx512fp16,avx512vl")]
23676    unsafe fn test_mm_mask_reduce_sh() {
23677        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23678        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23679        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23680        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 0, a, b);
23681        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23682        assert_eq_m128h(r, e);
23683        let r = _mm_mask_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(src, 1, a, b);
23684        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23685        assert_eq_m128h(r, e);
23686    }
23687
23688    #[simd_test(enable = "avx512fp16,avx512vl")]
23689    unsafe fn test_mm_maskz_reduce_sh() {
23690        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23691        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23692        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(0, a, b);
23693        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23694        assert_eq_m128h(r, e);
23695        let r = _mm_maskz_reduce_sh::<{ 16 | _MM_FROUND_TO_ZERO }>(1, a, b);
23696        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23697        assert_eq_m128h(r, e);
23698    }
23699
23700    #[simd_test(enable = "avx512fp16,avx512vl")]
23701    unsafe fn test_mm_reduce_round_sh() {
23702        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23703        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23704        let r = _mm_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(a, b);
23705        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23706        assert_eq_m128h(r, e);
23707    }
23708
23709    #[simd_test(enable = "avx512fp16,avx512vl")]
23710    unsafe fn test_mm_mask_reduce_round_sh() {
23711        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23712        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23713        let src = _mm_setr_ph(2.0, 30., 31., 32., 33., 34., 35., 36.);
23714        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23715            src, 0, a, b,
23716        );
23717        let e = _mm_setr_ph(2.0, 10., 11., 12., 13., 14., 15., 16.);
23718        assert_eq_m128h(r, e);
23719        let r = _mm_mask_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(
23720            src, 1, a, b,
23721        );
23722        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23723        assert_eq_m128h(r, e);
23724    }
23725
23726    #[simd_test(enable = "avx512fp16,avx512vl")]
23727    unsafe fn test_mm_maskz_reduce_round_sh() {
23728        let a = _mm_setr_ph(3.0, 10., 11., 12., 13., 14., 15., 16.);
23729        let b = _mm_setr_ph(1.25, 20., 21., 22., 23., 24., 25., 26.);
23730        let r =
23731            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(0, a, b);
23732        let e = _mm_setr_ph(0.0, 10., 11., 12., 13., 14., 15., 16.);
23733        assert_eq_m128h(r, e);
23734        let r =
23735            _mm_maskz_reduce_round_sh::<{ 16 | _MM_FROUND_TO_ZERO }, _MM_FROUND_NO_EXC>(1, a, b);
23736        let e = _mm_setr_ph(0.25, 10., 11., 12., 13., 14., 15., 16.);
23737        assert_eq_m128h(r, e);
23738    }
23739
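    // The reduce_{add,mul,max,min} helpers fold a whole vector into a single f16. With every
    // lane set to 2.0 the sums are 16, 32 and 64 for 8, 16 and 32 lanes, and the full products
    // are 2^8, 2^16 and 2^32; note that f16 tops out near 65504, so the two larger products lie
    // outside the finite f16 range.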
23740    #[simd_test(enable = "avx512fp16,avx512vl")]
23741    unsafe fn test_mm_reduce_add_ph() {
23742        let a = _mm_set1_ph(2.0);
23743        let r = _mm_reduce_add_ph(a);
23744        assert_eq!(r, 16.0);
23745    }
23746
23747    #[simd_test(enable = "avx512fp16,avx512vl")]
23748    unsafe fn test_mm256_reduce_add_ph() {
23749        let a = _mm256_set1_ph(2.0);
23750        let r = _mm256_reduce_add_ph(a);
23751        assert_eq!(r, 32.0);
23752    }
23753
23754    #[simd_test(enable = "avx512fp16")]
23755    unsafe fn test_mm512_reduce_add_ph() {
23756        let a = _mm512_set1_ph(2.0);
23757        let r = _mm512_reduce_add_ph(a);
23758        assert_eq!(r, 64.0);
23759    }
23760
23761    #[simd_test(enable = "avx512fp16,avx512vl")]
23762    unsafe fn test_mm_reduce_mul_ph() {
23763        let a = _mm_set1_ph(2.0);
23764        let r = _mm_reduce_mul_ph(a);
23765        assert_eq!(r, 256.0);
23766    }
23767
23768    #[simd_test(enable = "avx512fp16,avx512vl")]
23769    unsafe fn test_mm256_reduce_mul_ph() {
23770        let a = _mm256_set1_ph(2.0);
23771        let r = _mm256_reduce_mul_ph(a);
23772        assert_eq!(r, 65536.0);
23773    }
23774
23775    #[simd_test(enable = "avx512fp16")]
23776    unsafe fn test_mm512_reduce_mul_ph() {
23777        let a = _mm512_set1_ph(2.0);
23778        let r = _mm512_reduce_mul_ph(a);
23779        assert_eq!(r, 4294967296.0);
23780    }
23781
23782    #[simd_test(enable = "avx512fp16,avx512vl")]
23783    unsafe fn test_mm_reduce_max_ph() {
23784        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23785        let r = _mm_reduce_max_ph(a);
23786        assert_eq!(r, 8.0);
23787    }
23788
23789    #[simd_test(enable = "avx512fp16,avx512vl")]
23790    unsafe fn test_mm256_reduce_max_ph() {
23791        let a = _mm256_set_ph(
23792            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23793        );
23794        let r = _mm256_reduce_max_ph(a);
23795        assert_eq!(r, 16.0);
23796    }
23797
23798    #[simd_test(enable = "avx512fp16")]
23799    unsafe fn test_mm512_reduce_max_ph() {
23800        let a = _mm512_set_ph(
23801            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23802            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23803            31.0, 32.0,
23804        );
23805        let r = _mm512_reduce_max_ph(a);
23806        assert_eq!(r, 32.0);
23807    }
23808
23809    #[simd_test(enable = "avx512fp16,avx512vl")]
23810    unsafe fn test_mm_reduce_min_ph() {
23811        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
23812        let r = _mm_reduce_min_ph(a);
23813        assert_eq!(r, 1.0);
23814    }
23815
23816    #[simd_test(enable = "avx512fp16,avx512vl")]
23817    unsafe fn test_mm256_reduce_min_ph() {
23818        let a = _mm256_set_ph(
23819            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23820        );
23821        let r = _mm256_reduce_min_ph(a);
23822        assert_eq!(r, 1.0);
23823    }
23824
23825    #[simd_test(enable = "avx512fp16")]
23826    unsafe fn test_mm512_reduce_min_ph() {
23827        let a = _mm512_set_ph(
23828            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
23829            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
23830            31.0, 32.0,
23831        );
23832        let r = _mm512_reduce_min_ph(a);
23833        assert_eq!(r, 1.0);
23834    }
23835
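    // fpclass (VFPCLASSPH) tests each lane against the categories selected by the imm8 -
    // bit 0 QNaN, bit 1 +0, bit 2 -0, bit 3 +infinity, bit 4 -infinity, bit 5 denormal,
    // bit 6 finite negative, bit 7 SNaN - so 0x18 (bits 3 and 4) matches only the two
    // infinity lanes below.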
23836    #[simd_test(enable = "avx512fp16,avx512vl")]
23837    unsafe fn test_mm_fpclass_ph_mask() {
23838        let a = _mm_set_ph(
23839            1.,
23840            f16::INFINITY,
23841            f16::NEG_INFINITY,
23842            0.0,
23843            -0.0,
23844            -2.0,
23845            f16::NAN,
23846            5.9e-8, // Denormal
23847        );
23848        let r = _mm_fpclass_ph_mask::<0x18>(a); // infinities
23849        assert_eq!(r, 0b01100000);
23850    }
23851
23852    #[simd_test(enable = "avx512fp16,avx512vl")]
23853    unsafe fn test_mm_mask_fpclass_ph_mask() {
23854        let a = _mm_set_ph(
23855            1.,
23856            f16::INFINITY,
23857            f16::NEG_INFINITY,
23858            0.0,
23859            -0.0,
23860            -2.0,
23861            f16::NAN,
23862            5.9e-8, // Denormal
23863        );
23864        let r = _mm_mask_fpclass_ph_mask::<0x18>(0b01010101, a);
23865        assert_eq!(r, 0b01000000);
23866    }
23867
23868    #[simd_test(enable = "avx512fp16,avx512vl")]
23869    unsafe fn test_mm256_fpclass_ph_mask() {
23870        let a = _mm256_set_ph(
23871            1.,
23872            f16::INFINITY,
23873            f16::NEG_INFINITY,
23874            0.0,
23875            -0.0,
23876            -2.0,
23877            f16::NAN,
23878            5.9e-8, // Denormal
23879            1.,
23880            f16::INFINITY,
23881            f16::NEG_INFINITY,
23882            0.0,
23883            -0.0,
23884            -2.0,
23885            f16::NAN,
23886            5.9e-8, // Denormal
23887        );
23888        let r = _mm256_fpclass_ph_mask::<0x18>(a); // infinities
23889        assert_eq!(r, 0b0110000001100000);
23890    }
23891
23892    #[simd_test(enable = "avx512fp16,avx512vl")]
23893    unsafe fn test_mm256_mask_fpclass_ph_mask() {
23894        let a = _mm256_set_ph(
23895            1.,
23896            f16::INFINITY,
23897            f16::NEG_INFINITY,
23898            0.0,
23899            -0.0,
23900            -2.0,
23901            f16::NAN,
23902            5.9e-8, // Denormal
23903            1.,
23904            f16::INFINITY,
23905            f16::NEG_INFINITY,
23906            0.0,
23907            -0.0,
23908            -2.0,
23909            f16::NAN,
23910            5.9e-8, // Denormal
23911        );
23912        let r = _mm256_mask_fpclass_ph_mask::<0x18>(0b0101010101010101, a);
23913        assert_eq!(r, 0b0100000001000000);
23914    }
23915
23916    #[simd_test(enable = "avx512fp16")]
23917    unsafe fn test_mm512_fpclass_ph_mask() {
23918        let a = _mm512_set_ph(
23919            1.,
23920            f16::INFINITY,
23921            f16::NEG_INFINITY,
23922            0.0,
23923            -0.0,
23924            -2.0,
23925            f16::NAN,
23926            5.9e-8, // Denormal
23927            1.,
23928            f16::INFINITY,
23929            f16::NEG_INFINITY,
23930            0.0,
23931            -0.0,
23932            -2.0,
23933            f16::NAN,
23934            5.9e-8, // Denormal
23935            1.,
23936            f16::INFINITY,
23937            f16::NEG_INFINITY,
23938            0.0,
23939            -0.0,
23940            -2.0,
23941            f16::NAN,
23942            5.9e-8, // Denormal
23943            1.,
23944            f16::INFINITY,
23945            f16::NEG_INFINITY,
23946            0.0,
23947            -0.0,
23948            -2.0,
23949            f16::NAN,
23950            5.9e-8, // Denormal
23951        );
23952        let r = _mm512_fpclass_ph_mask::<0x18>(a); // infinities
23953        assert_eq!(r, 0b01100000011000000110000001100000);
23954    }
23955
23956    #[simd_test(enable = "avx512fp16")]
23957    unsafe fn test_mm512_mask_fpclass_ph_mask() {
23958        let a = _mm512_set_ph(
23959            1.,
23960            f16::INFINITY,
23961            f16::NEG_INFINITY,
23962            0.0,
23963            -0.0,
23964            -2.0,
23965            f16::NAN,
23966            5.9e-8, // Denormal
23967            1.,
23968            f16::INFINITY,
23969            f16::NEG_INFINITY,
23970            0.0,
23971            -0.0,
23972            -2.0,
23973            f16::NAN,
23974            5.9e-8, // Denormal
23975            1.,
23976            f16::INFINITY,
23977            f16::NEG_INFINITY,
23978            0.0,
23979            -0.0,
23980            -2.0,
23981            f16::NAN,
23982            5.9e-8, // Denormal
23983            1.,
23984            f16::INFINITY,
23985            f16::NEG_INFINITY,
23986            0.0,
23987            -0.0,
23988            -2.0,
23989            f16::NAN,
23990            5.9e-8, // Denormal
23991        );
23992        let r = _mm512_mask_fpclass_ph_mask::<0x18>(0b01010101010101010101010101010101, a);
23993        assert_eq!(r, 0b01000000010000000100000001000000);
23994    }
23995
23996    #[simd_test(enable = "avx512fp16")]
23997    unsafe fn test_mm_fpclass_sh_mask() {
23998        let a = _mm_set_sh(f16::INFINITY);
23999        let r = _mm_fpclass_sh_mask::<0x18>(a);
24000        assert_eq!(r, 1);
24001    }
24002
24003    #[simd_test(enable = "avx512fp16")]
24004    unsafe fn test_mm_mask_fpclass_sh_mask() {
24005        let a = _mm_set_sh(f16::INFINITY);
24006        let r = _mm_mask_fpclass_sh_mask::<0x18>(0, a);
24007        assert_eq!(r, 0);
24008        let r = _mm_mask_fpclass_sh_mask::<0x18>(1, a);
24009        assert_eq!(r, 1);
24010    }
24011
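    // mask_blend is a pure lane select: lane i of the result is b[i] when mask bit i is set and
    // a[i] otherwise, roughly `if (k >> i) & 1 != 0 { b[i] } else { a[i] }` per lane.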
24012    #[simd_test(enable = "avx512fp16,avx512vl")]
24013    unsafe fn test_mm_mask_blend_ph() {
24014        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24015        let b = _mm_set_ph(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0);
24016        let r = _mm_mask_blend_ph(0b01010101, a, b);
24017        let e = _mm_set_ph(1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0);
24018        assert_eq_m128h(r, e);
24019    }
24020
24021    #[simd_test(enable = "avx512fp16,avx512vl")]
24022    unsafe fn test_mm256_mask_blend_ph() {
24023        let a = _mm256_set_ph(
24024            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24025        );
24026        let b = _mm256_set_ph(
24027            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
24028            -14.0, -15.0, -16.0,
24029        );
24030        let r = _mm256_mask_blend_ph(0b0101010101010101, a, b);
24031        let e = _mm256_set_ph(
24032            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
24033            -16.0,
24034        );
24035        assert_eq_m256h(r, e);
24036    }
24037
24038    #[simd_test(enable = "avx512fp16")]
24039    unsafe fn test_mm512_mask_blend_ph() {
24040        let a = _mm512_set_ph(
24041            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24042            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24043            31.0, 32.0,
24044        );
24045        let b = _mm512_set_ph(
24046            -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0, -9.0, -10.0, -11.0, -12.0, -13.0,
24047            -14.0, -15.0, -16.0, -17.0, -18.0, -19.0, -20.0, -21.0, -22.0, -23.0, -24.0, -25.0,
24048            -26.0, -27.0, -28.0, -29.0, -30.0, -31.0, -32.0,
24049        );
24050        let r = _mm512_mask_blend_ph(0b01010101010101010101010101010101, a, b);
24051        let e = _mm512_set_ph(
24052            1.0, -2.0, 3.0, -4.0, 5.0, -6.0, 7.0, -8.0, 9.0, -10.0, 11.0, -12.0, 13.0, -14.0, 15.0,
24053            -16.0, 17.0, -18.0, 19.0, -20.0, 21.0, -22.0, 23.0, -24.0, 25.0, -26.0, 27.0, -28.0,
24054            29.0, -30.0, 31.0, -32.0,
24055        );
24056        assert_eq_m512h(r, e);
24057    }
24058
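    // permutex2var treats a and b as one table of 2*N lanes (a holds indices 0..N-1, b holds
    // N..2N-1) and gathers result lane i from that table at idx[i]; the even indices used below
    // therefore select every other element of the concatenation.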
24059    #[simd_test(enable = "avx512fp16,avx512vl")]
24060    unsafe fn test_mm_permutex2var_ph() {
24061        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24062        let b = _mm_setr_ph(9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
24063        let idx = _mm_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14);
24064        let r = _mm_permutex2var_ph(a, idx, b);
24065        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0);
24066        assert_eq_m128h(r, e);
24067    }
24068
24069    #[simd_test(enable = "avx512fp16,avx512vl")]
24070    unsafe fn test_mm256_permutex2var_ph() {
24071        let a = _mm256_setr_ph(
24072            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24073        );
24074        let b = _mm256_setr_ph(
24075            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24076            31.0, 32.0,
24077        );
24078        let idx = _mm256_setr_epi16(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
24079        let r = _mm256_permutex2var_ph(a, idx, b);
24080        let e = _mm256_setr_ph(
24081            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24082            31.0,
24083        );
24084        assert_eq_m256h(r, e);
24085    }
24086
24087    #[simd_test(enable = "avx512fp16")]
24088    unsafe fn test_mm512_permutex2var_ph() {
24089        let a = _mm512_setr_ph(
24090            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24091            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24092            31.0, 32.0,
24093        );
24094        let b = _mm512_setr_ph(
24095            33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0,
24096            47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0,
24097            61.0, 62.0, 63.0, 64.0,
24098        );
24099        let idx = _mm512_set_epi16(
24100            62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26, 24, 22, 20,
24101            18, 16, 14, 12, 10, 8, 6, 4, 2, 0,
24102        );
24103        let r = _mm512_permutex2var_ph(a, idx, b);
24104        let e = _mm512_setr_ph(
24105            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24106            31.0, 33.0, 35.0, 37.0, 39.0, 41.0, 43.0, 45.0, 47.0, 49.0, 51.0, 53.0, 55.0, 57.0,
24107            59.0, 61.0, 63.0,
24108        );
24109        assert_eq_m512h(r, e);
24110    }
24111
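    // permutexvar is the single-table variant: result lane i is a[idx[i]], with only the low
    // bits of each index used to select a lane.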
24112    #[simd_test(enable = "avx512fp16,avx512vl")]
24113    unsafe fn test_mm_permutexvar_ph() {
24114        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24115        let idx = _mm_set_epi16(0, 2, 4, 6, 1, 3, 5, 7);
24116        let r = _mm_permutexvar_ph(idx, a);
24117        let e = _mm_setr_ph(1.0, 3.0, 5.0, 7.0, 2.0, 4.0, 6.0, 8.0);
24118        assert_eq_m128h(r, e);
24119    }
24120
24121    #[simd_test(enable = "avx512fp16,avx512vl")]
24122    unsafe fn test_mm256_permutexvar_ph() {
24123        let a = _mm256_set_ph(
24124            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24125        );
24126        let idx = _mm256_set_epi16(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
24127        let r = _mm256_permutexvar_ph(idx, a);
24128        let e = _mm256_setr_ph(
24129            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0,
24130        );
24131        assert_eq_m256h(r, e);
24132    }
24133
24134    #[simd_test(enable = "avx512fp16")]
24135    unsafe fn test_mm512_permutexvar_ph() {
24136        let a = _mm512_set_ph(
24137            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24138            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24139            31.0, 32.0,
24140        );
24141        let idx = _mm512_set_epi16(
24142            0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 1, 3, 5, 7, 9, 11, 13, 15,
24143            17, 19, 21, 23, 25, 27, 29, 31,
24144        );
24145        let r = _mm512_permutexvar_ph(idx, a);
24146        let e = _mm512_setr_ph(
24147            1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0, 17.0, 19.0, 21.0, 23.0, 25.0, 27.0, 29.0,
24148            31.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0,
24149            30.0, 32.0,
24150        );
24151        assert_eq_m512h(r, e);
24152    }
24153
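    // cvtepi16/cvtepu16 convert 16-bit integers to f16 lane-for-lane, so source and destination
    // have the same element count; the masked variants take the corresponding src lane (or zero
    // for maskz) wherever a mask bit is clear.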
24154    #[simd_test(enable = "avx512fp16,avx512vl")]
24155    unsafe fn test_mm_cvtepi16_ph() {
24156        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24157        let r = _mm_cvtepi16_ph(a);
24158        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24159        assert_eq_m128h(r, e);
24160    }
24161
24162    #[simd_test(enable = "avx512fp16,avx512vl")]
24163    unsafe fn test_mm_mask_cvtepi16_ph() {
24164        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24165        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24166        let r = _mm_mask_cvtepi16_ph(src, 0b01010101, a);
24167        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24168        assert_eq_m128h(r, e);
24169    }
24170
24171    #[simd_test(enable = "avx512fp16,avx512vl")]
24172    unsafe fn test_mm_maskz_cvtepi16_ph() {
24173        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24174        let r = _mm_maskz_cvtepi16_ph(0b01010101, a);
24175        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24176        assert_eq_m128h(r, e);
24177    }
24178
24179    #[simd_test(enable = "avx512fp16,avx512vl")]
24180    unsafe fn test_mm256_cvtepi16_ph() {
24181        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24182        let r = _mm256_cvtepi16_ph(a);
24183        let e = _mm256_set_ph(
24184            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24185        );
24186        assert_eq_m256h(r, e);
24187    }
24188
24189    #[simd_test(enable = "avx512fp16,avx512vl")]
24190    unsafe fn test_mm256_mask_cvtepi16_ph() {
24191        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24192        let src = _mm256_set_ph(
24193            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24194        );
24195        let r = _mm256_mask_cvtepi16_ph(src, 0b0101010101010101, a);
24196        let e = _mm256_set_ph(
24197            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24198        );
24199        assert_eq_m256h(r, e);
24200    }
24201
24202    #[simd_test(enable = "avx512fp16,avx512vl")]
24203    unsafe fn test_mm256_maskz_cvtepi16_ph() {
24204        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24205        let r = _mm256_maskz_cvtepi16_ph(0b0101010101010101, a);
24206        let e = _mm256_set_ph(
24207            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24208        );
24209        assert_eq_m256h(r, e);
24210    }
24211
24212    #[simd_test(enable = "avx512fp16")]
24213    unsafe fn test_mm512_cvtepi16_ph() {
24214        let a = _mm512_set_epi16(
24215            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24216            25, 26, 27, 28, 29, 30, 31, 32,
24217        );
24218        let r = _mm512_cvtepi16_ph(a);
24219        let e = _mm512_set_ph(
24220            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24221            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24222            31.0, 32.0,
24223        );
24224        assert_eq_m512h(r, e);
24225    }
24226
24227    #[simd_test(enable = "avx512fp16")]
24228    unsafe fn test_mm512_mask_cvtepi16_ph() {
24229        let a = _mm512_set_epi16(
24230            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24231            25, 26, 27, 28, 29, 30, 31, 32,
24232        );
24233        let src = _mm512_set_ph(
24234            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24235            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24236        );
24237        let r = _mm512_mask_cvtepi16_ph(src, 0b01010101010101010101010101010101, a);
24238        let e = _mm512_set_ph(
24239            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24240            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24241        );
24242        assert_eq_m512h(r, e);
24243    }
24244
24245    #[simd_test(enable = "avx512fp16")]
24246    unsafe fn test_mm512_maskz_cvtepi16_ph() {
24247        let a = _mm512_set_epi16(
24248            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24249            25, 26, 27, 28, 29, 30, 31, 32,
24250        );
24251        let r = _mm512_maskz_cvtepi16_ph(0b01010101010101010101010101010101, a);
24252        let e = _mm512_set_ph(
24253            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24254            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24255        );
24256        assert_eq_m512h(r, e);
24257    }
24258
24259    #[simd_test(enable = "avx512fp16")]
24260    unsafe fn test_mm512_cvt_roundepi16_ph() {
24261        let a = _mm512_set_epi16(
24262            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24263            25, 26, 27, 28, 29, 30, 31, 32,
24264        );
24265        let r = _mm512_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24266        let e = _mm512_set_ph(
24267            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24268            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24269            31.0, 32.0,
24270        );
24271        assert_eq_m512h(r, e);
24272    }
24273
24274    #[simd_test(enable = "avx512fp16")]
24275    unsafe fn test_mm512_mask_cvt_roundepi16_ph() {
24276        let a = _mm512_set_epi16(
24277            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24278            25, 26, 27, 28, 29, 30, 31, 32,
24279        );
24280        let src = _mm512_set_ph(
24281            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24282            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24283        );
24284        let r = _mm512_mask_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24285            src,
24286            0b01010101010101010101010101010101,
24287            a,
24288        );
24289        let e = _mm512_set_ph(
24290            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24291            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24292        );
24293        assert_eq_m512h(r, e);
24294    }
24295
24296    #[simd_test(enable = "avx512fp16")]
24297    unsafe fn test_mm512_maskz_cvt_roundepi16_ph() {
24298        let a = _mm512_set_epi16(
24299            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24300            25, 26, 27, 28, 29, 30, 31, 32,
24301        );
24302        let r = _mm512_maskz_cvt_roundepi16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24303            0b01010101010101010101010101010101,
24304            a,
24305        );
24306        let e = _mm512_set_ph(
24307            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24308            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24309        );
24310        assert_eq_m512h(r, e);
24311    }
24312
24313    #[simd_test(enable = "avx512fp16,avx512vl")]
24314    unsafe fn test_mm_cvtepu16_ph() {
24315        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24316        let r = _mm_cvtepu16_ph(a);
24317        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24318        assert_eq_m128h(r, e);
24319    }
24320
24321    #[simd_test(enable = "avx512fp16,avx512vl")]
24322    unsafe fn test_mm_mask_cvtepu16_ph() {
24323        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24324        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24325        let r = _mm_mask_cvtepu16_ph(src, 0b01010101, a);
24326        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24327        assert_eq_m128h(r, e);
24328    }
24329
24330    #[simd_test(enable = "avx512fp16,avx512vl")]
24331    unsafe fn test_mm_maskz_cvtepu16_ph() {
24332        let a = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
24333        let r = _mm_maskz_cvtepu16_ph(0b01010101, a);
24334        let e = _mm_set_ph(0., 2., 0., 4., 0., 6., 0., 8.);
24335        assert_eq_m128h(r, e);
24336    }
24337
24338    #[simd_test(enable = "avx512fp16,avx512vl")]
24339    unsafe fn test_mm256_cvtepu16_ph() {
24340        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24341        let r = _mm256_cvtepu16_ph(a);
24342        let e = _mm256_set_ph(
24343            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24344        );
24345        assert_eq_m256h(r, e);
24346    }
24347
24348    #[simd_test(enable = "avx512fp16,avx512vl")]
24349    unsafe fn test_mm256_mask_cvtepu16_ph() {
24350        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24351        let src = _mm256_set_ph(
24352            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24353        );
24354        let r = _mm256_mask_cvtepu16_ph(src, 0b0101010101010101, a);
24355        let e = _mm256_set_ph(
24356            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24357        );
24358        assert_eq_m256h(r, e);
24359    }
24360
24361    #[simd_test(enable = "avx512fp16,avx512vl")]
24362    unsafe fn test_mm256_maskz_cvtepu16_ph() {
24363        let a = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24364        let r = _mm256_maskz_cvtepu16_ph(0b0101010101010101, a);
24365        let e = _mm256_set_ph(
24366            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16.,
24367        );
24368        assert_eq_m256h(r, e);
24369    }
24370
24371    #[simd_test(enable = "avx512fp16")]
24372    unsafe fn test_mm512_cvtepu16_ph() {
24373        let a = _mm512_set_epi16(
24374            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24375            25, 26, 27, 28, 29, 30, 31, 32,
24376        );
24377        let r = _mm512_cvtepu16_ph(a);
24378        let e = _mm512_set_ph(
24379            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24380            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24381            31.0, 32.0,
24382        );
24383        assert_eq_m512h(r, e);
24384    }
24385
24386    #[simd_test(enable = "avx512fp16")]
24387    unsafe fn test_mm512_mask_cvtepu16_ph() {
24388        let a = _mm512_set_epi16(
24389            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24390            25, 26, 27, 28, 29, 30, 31, 32,
24391        );
24392        let src = _mm512_set_ph(
24393            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24394            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24395        );
24396        let r = _mm512_mask_cvtepu16_ph(src, 0b01010101010101010101010101010101, a);
24397        let e = _mm512_set_ph(
24398            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24399            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24400        );
24401        assert_eq_m512h(r, e);
24402    }
24403
24404    #[simd_test(enable = "avx512fp16")]
24405    unsafe fn test_mm512_maskz_cvtepu16_ph() {
24406        let a = _mm512_set_epi16(
24407            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24408            25, 26, 27, 28, 29, 30, 31, 32,
24409        );
24410        let r = _mm512_maskz_cvtepu16_ph(0b01010101010101010101010101010101, a);
24411        let e = _mm512_set_ph(
24412            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24413            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24414        );
24415        assert_eq_m512h(r, e);
24416    }
24417
24418    #[simd_test(enable = "avx512fp16")]
24419    unsafe fn test_mm512_cvt_roundepu16_ph() {
24420        let a = _mm512_set_epi16(
24421            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24422            25, 26, 27, 28, 29, 30, 31, 32,
24423        );
24424        let r = _mm512_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24425        let e = _mm512_set_ph(
24426            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24427            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
24428            31.0, 32.0,
24429        );
24430        assert_eq_m512h(r, e);
24431    }
24432
24433    #[simd_test(enable = "avx512fp16")]
24434    unsafe fn test_mm512_mask_cvt_roundepu16_ph() {
24435        let a = _mm512_set_epi16(
24436            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24437            25, 26, 27, 28, 29, 30, 31, 32,
24438        );
24439        let src = _mm512_set_ph(
24440            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
24441            27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
24442        );
24443        let r = _mm512_mask_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24444            src,
24445            0b01010101010101010101010101010101,
24446            a,
24447        );
24448        let e = _mm512_set_ph(
24449            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16., 26., 18.,
24450            28., 20., 30., 22., 32., 24., 34., 26., 36., 28., 38., 30., 40., 32.,
24451        );
24452        assert_eq_m512h(r, e);
24453    }
24454
24455    #[simd_test(enable = "avx512fp16")]
24456    unsafe fn test_mm512_maskz_cvt_roundepu16_ph() {
24457        let a = _mm512_set_epi16(
24458            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
24459            25, 26, 27, 28, 29, 30, 31, 32,
24460        );
24461        let r = _mm512_maskz_cvt_roundepu16_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24462            0b01010101010101010101010101010101,
24463            a,
24464        );
24465        let e = _mm512_set_ph(
24466            0., 2., 0., 4., 0., 6., 0., 8., 0., 10., 0., 12., 0., 14., 0., 16., 0., 18., 0., 20.,
24467            0., 22., 0., 24., 0., 26., 0., 28., 0., 30., 0., 32.,
24468        );
24469        assert_eq_m512h(r, e);
24470    }
24471
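    // Converting 32-bit integers to f16 narrows the vector: __m128i (4 lanes) fills only the
    // low half of an __m128h with the upper four lanes zeroed (even in the masked forms, as the
    // expected values below assume), __m256i yields an __m128h and __m512i yields an __m256h.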
24472    #[simd_test(enable = "avx512fp16,avx512vl")]
24473    unsafe fn test_mm_cvtepi32_ph() {
24474        let a = _mm_set_epi32(1, 2, 3, 4);
24475        let r = _mm_cvtepi32_ph(a);
24476        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24477        assert_eq_m128h(r, e);
24478    }
24479
24480    #[simd_test(enable = "avx512fp16,avx512vl")]
24481    unsafe fn test_mm_mask_cvtepi32_ph() {
24482        let a = _mm_set_epi32(1, 2, 3, 4);
24483        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24484        let r = _mm_mask_cvtepi32_ph(src, 0b0101, a);
24485        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24486        assert_eq_m128h(r, e);
24487    }
24488
24489    #[simd_test(enable = "avx512fp16,avx512vl")]
24490    unsafe fn test_mm_maskz_cvtepi32_ph() {
24491        let a = _mm_set_epi32(1, 2, 3, 4);
24492        let r = _mm_maskz_cvtepi32_ph(0b0101, a);
24493        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24494        assert_eq_m128h(r, e);
24495    }
24496
24497    #[simd_test(enable = "avx512fp16,avx512vl")]
24498    unsafe fn test_mm256_cvtepi32_ph() {
24499        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24500        let r = _mm256_cvtepi32_ph(a);
24501        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24502        assert_eq_m128h(r, e);
24503    }
24504
24505    #[simd_test(enable = "avx512fp16,avx512vl")]
24506    unsafe fn test_mm256_mask_cvtepi32_ph() {
24507        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24508        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24509        let r = _mm256_mask_cvtepi32_ph(src, 0b01010101, a);
24510        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24511        assert_eq_m128h(r, e);
24512    }
24513
24514    #[simd_test(enable = "avx512fp16,avx512vl")]
24515    unsafe fn test_mm256_maskz_cvtepi32_ph() {
24516        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24517        let r = _mm256_maskz_cvtepi32_ph(0b01010101, a);
24518        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24519        assert_eq_m128h(r, e);
24520    }
24521
24522    #[simd_test(enable = "avx512fp16")]
24523    unsafe fn test_mm512_cvtepi32_ph() {
24524        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24525        let r = _mm512_cvtepi32_ph(a);
24526        let e = _mm256_set_ph(
24527            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24528        );
24529        assert_eq_m256h(r, e);
24530    }
24531
24532    #[simd_test(enable = "avx512fp16")]
24533    unsafe fn test_mm512_mask_cvtepi32_ph() {
24534        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24535        let src = _mm256_set_ph(
24536            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24537        );
24538        let r = _mm512_mask_cvtepi32_ph(src, 0b0101010101010101, a);
24539        let e = _mm256_set_ph(
24540            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24541        );
24542        assert_eq_m256h(r, e);
24543    }
24544
24545    #[simd_test(enable = "avx512fp16")]
24546    unsafe fn test_mm512_maskz_cvtepi32_ph() {
24547        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24548        let r = _mm512_maskz_cvtepi32_ph(0b0101010101010101, a);
24549        let e = _mm256_set_ph(
24550            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24551        );
24552        assert_eq_m256h(r, e);
24553    }
24554
24555    #[simd_test(enable = "avx512fp16")]
24556    unsafe fn test_mm512_cvt_roundepi32_ph() {
24557        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24558        let r = _mm512_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24559        let e = _mm256_set_ph(
24560            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24561        );
24562        assert_eq_m256h(r, e);
24563    }
24564
24565    #[simd_test(enable = "avx512fp16")]
24566    unsafe fn test_mm512_mask_cvt_roundepi32_ph() {
24567        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24568        let src = _mm256_set_ph(
24569            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24570        );
24571        let r = _mm512_mask_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24572            src,
24573            0b0101010101010101,
24574            a,
24575        );
24576        let e = _mm256_set_ph(
24577            10., 2., 12., 4., 14., 6., 16., 8., 18., 10., 20., 12., 22., 14., 24., 16.,
24578        );
24579        assert_eq_m256h(r, e);
24580    }
24581
24582    #[simd_test(enable = "avx512fp16")]
24583    unsafe fn test_mm512_maskz_cvt_roundepi32_ph() {
24584        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24585        let r = _mm512_maskz_cvt_roundepi32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24586            0b0101010101010101,
24587            a,
24588        );
24589        let e = _mm256_set_ph(
24590            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24591        );
24592        assert_eq_m256h(r, e);
24593    }
24594
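    // _mm_cvti32_sh converts one i32 into lane 0 of the result and copies lanes 1..7 from a,
    // mirroring the other scalar (sh) intrinsics in this module.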
24595    #[simd_test(enable = "avx512fp16,avx512vl")]
24596    unsafe fn test_mm_cvti32_sh() {
24597        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24598        let r = _mm_cvti32_sh(a, 10);
24599        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24600        assert_eq_m128h(r, e);
24601    }
24602
24603    #[simd_test(enable = "avx512fp16,avx512vl")]
24604    unsafe fn test_mm_cvt_roundi32_sh() {
24605        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24606        let r = _mm_cvt_roundi32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24607        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24608        assert_eq_m128h(r, e);
24609    }
24610
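    // The unsigned cvtepu32 conversions follow the same shapes as the signed versions above and
    // convert these small positive inputs identically.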
24611    #[simd_test(enable = "avx512fp16,avx512vl")]
24612    unsafe fn test_mm_cvtepu32_ph() {
24613        let a = _mm_set_epi32(1, 2, 3, 4);
24614        let r = _mm_cvtepu32_ph(a);
24615        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24616        assert_eq_m128h(r, e);
24617    }
24618
24619    #[simd_test(enable = "avx512fp16,avx512vl")]
24620    unsafe fn test_mm_mask_cvtepu32_ph() {
24621        let a = _mm_set_epi32(1, 2, 3, 4);
24622        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24623        let r = _mm_mask_cvtepu32_ph(src, 0b0101, a);
24624        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2., 16., 4.);
24625        assert_eq_m128h(r, e);
24626    }
24627
24628    #[simd_test(enable = "avx512fp16,avx512vl")]
24629    unsafe fn test_mm_maskz_cvtepu32_ph() {
24630        let a = _mm_set_epi32(1, 2, 3, 4);
24631        let r = _mm_maskz_cvtepu32_ph(0b0101, a);
24632        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2., 0.0, 4.);
24633        assert_eq_m128h(r, e);
24634    }
24635
24636    #[simd_test(enable = "avx512fp16,avx512vl")]
24637    unsafe fn test_mm256_cvtepu32_ph() {
24638        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24639        let r = _mm256_cvtepu32_ph(a);
24640        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24641        assert_eq_m128h(r, e);
24642    }
24643
24644    #[simd_test(enable = "avx512fp16,avx512vl")]
24645    unsafe fn test_mm256_mask_cvtepu32_ph() {
24646        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24647        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24648        let r = _mm256_mask_cvtepu32_ph(src, 0b01010101, a);
24649        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24650        assert_eq_m128h(r, e);
24651    }
24652
24653    #[simd_test(enable = "avx512fp16,avx512vl")]
24654    unsafe fn test_mm256_maskz_cvtepu32_ph() {
24655        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
24656        let r = _mm256_maskz_cvtepu32_ph(0b01010101, a);
24657        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
24658        assert_eq_m128h(r, e);
24659    }
24660
24661    #[simd_test(enable = "avx512fp16")]
24662    unsafe fn test_mm512_cvtepu32_ph() {
24663        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24664        let r = _mm512_cvtepu32_ph(a);
24665        let e = _mm256_set_ph(
24666            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24667        );
24668        assert_eq_m256h(r, e);
24669    }
24670
24671    #[simd_test(enable = "avx512fp16")]
24672    unsafe fn test_mm512_mask_cvtepu32_ph() {
24673        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24674        let src = _mm256_set_ph(
24675            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24676        );
24677        let r = _mm512_mask_cvtepu32_ph(src, 0b0101010101010101, a);
24678        let e = _mm256_set_ph(
24679            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
24680        );
24681        assert_eq_m256h(r, e);
24682    }
24683
24684    #[simd_test(enable = "avx512fp16")]
24685    unsafe fn test_mm512_maskz_cvtepu32_ph() {
24686        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24687        let r = _mm512_maskz_cvtepu32_ph(0b0101010101010101, a);
24688        let e = _mm256_set_ph(
24689            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24690        );
24691        assert_eq_m256h(r, e);
24692    }
24693
24694    #[simd_test(enable = "avx512fp16")]
24695    unsafe fn test_mm512_cvt_roundepu32_ph() {
24696        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24697        let r = _mm512_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24698        let e = _mm256_set_ph(
24699            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
24700        );
24701        assert_eq_m256h(r, e);
24702    }
24703
24704    #[simd_test(enable = "avx512fp16")]
24705    unsafe fn test_mm512_mask_cvt_roundepu32_ph() {
24706        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24707        let src = _mm256_set_ph(
24708            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
24709        );
24710        let r = _mm512_mask_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24711            src,
24712            0b0101010101010101,
24713            a,
24714        );
24715        let e = _mm256_set_ph(
24716            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
24717            16.0,
24718        );
24719        assert_eq_m256h(r, e);
24720    }
24721
24722    #[simd_test(enable = "avx512fp16")]
24723    unsafe fn test_mm512_maskz_cvt_roundepu32_ph() {
24724        let a = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
24725        let r = _mm512_maskz_cvt_roundepu32_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24726            0b0101010101010101,
24727            a,
24728        );
24729        let e = _mm256_set_ph(
24730            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
24731        );
24732        assert_eq_m256h(r, e);
24733    }
24734
24735    #[simd_test(enable = "avx512fp16,avx512vl")]
24736    unsafe fn test_mm_cvtu32_sh() {
24737        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24738        let r = _mm_cvtu32_sh(a, 10);
24739        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24740        assert_eq_m128h(r, e);
24741    }
24742
24743    #[simd_test(enable = "avx512fp16,avx512vl")]
24744    unsafe fn test_mm_cvt_roundu32_sh() {
24745        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24746        let r = _mm_cvt_roundu32_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, 10);
24747        let e = _mm_setr_ph(10.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24748        assert_eq_m128h(r, e);
24749    }
24750
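    // 64-bit integer sources have at most eight lanes, so the 128-, 256- and
    // 512-bit variants all return a __m128h; lanes beyond the source width are
    // zeroed.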
24751    #[simd_test(enable = "avx512fp16,avx512vl")]
24752    unsafe fn test_mm_cvtepi64_ph() {
24753        let a = _mm_set_epi64x(1, 2);
24754        let r = _mm_cvtepi64_ph(a);
24755        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24756        assert_eq_m128h(r, e);
24757    }
24758
24759    #[simd_test(enable = "avx512fp16,avx512vl")]
24760    unsafe fn test_mm_mask_cvtepi64_ph() {
24761        let a = _mm_set_epi64x(1, 2);
24762        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24763        let r = _mm_mask_cvtepi64_ph(src, 0b01, a);
24764        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24765        assert_eq_m128h(r, e);
24766    }
24767
24768    #[simd_test(enable = "avx512fp16,avx512vl")]
24769    unsafe fn test_mm_maskz_cvtepi64_ph() {
24770        let a = _mm_set_epi64x(1, 2);
24771        let r = _mm_maskz_cvtepi64_ph(0b01, a);
24772        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.);
24773        assert_eq_m128h(r, e);
24774    }
24775
24776    #[simd_test(enable = "avx512fp16,avx512vl")]
24777    unsafe fn test_mm256_cvtepi64_ph() {
24778        let a = _mm256_set_epi64x(1, 2, 3, 4);
24779        let r = _mm256_cvtepi64_ph(a);
24780        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24781        assert_eq_m128h(r, e);
24782    }
24783
24784    #[simd_test(enable = "avx512fp16,avx512vl")]
24785    unsafe fn test_mm256_mask_cvtepi64_ph() {
24786        let a = _mm256_set_epi64x(1, 2, 3, 4);
24787        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24788        let r = _mm256_mask_cvtepi64_ph(src, 0b0101, a);
24789        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24790        assert_eq_m128h(r, e);
24791    }
24792
24793    #[simd_test(enable = "avx512fp16,avx512vl")]
24794    unsafe fn test_mm256_maskz_cvtepi64_ph() {
24795        let a = _mm256_set_epi64x(1, 2, 3, 4);
24796        let r = _mm256_maskz_cvtepi64_ph(0b0101, a);
24797        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24798        assert_eq_m128h(r, e);
24799    }
24800
24801    #[simd_test(enable = "avx512fp16,avx512vl")]
24802    unsafe fn test_mm512_cvtepi64_ph() {
24803        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24804        let r = _mm512_cvtepi64_ph(a);
24805        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24806        assert_eq_m128h(r, e);
24807    }
24808
24809    #[simd_test(enable = "avx512fp16,avx512vl")]
24810    unsafe fn test_mm512_mask_cvtepi64_ph() {
24811        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24812        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24813        let r = _mm512_mask_cvtepi64_ph(src, 0b01010101, a);
24814        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24815        assert_eq_m128h(r, e);
24816    }
24817
24818    #[simd_test(enable = "avx512fp16,avx512vl")]
24819    unsafe fn test_mm512_maskz_cvtepi64_ph() {
24820        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24821        let r = _mm512_maskz_cvtepi64_ph(0b01010101, a);
24822        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24823        assert_eq_m128h(r, e);
24824    }
24825
24826    #[simd_test(enable = "avx512fp16,avx512vl")]
24827    unsafe fn test_mm512_cvt_roundepi64_ph() {
24828        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24829        let r = _mm512_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24830        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24831        assert_eq_m128h(r, e);
24832    }
24833
24834    #[simd_test(enable = "avx512fp16")]
24835    unsafe fn test_mm512_mask_cvt_roundepi64_ph() {
24836        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24837        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24838        let r = _mm512_mask_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24839            src, 0b01010101, a,
24840        );
24841        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24842        assert_eq_m128h(r, e);
24843    }
24844
24845    #[simd_test(enable = "avx512fp16,avx512vl")]
24846    unsafe fn test_mm512_maskz_cvt_roundepi64_ph() {
24847        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24848        let r = _mm512_maskz_cvt_roundepi64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24849            0b01010101, a,
24850        );
24851        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24852        assert_eq_m128h(r, e);
24853    }
24854
24855    #[simd_test(enable = "avx512fp16,avx512vl")]
24856    unsafe fn test_mm_cvtepu64_ph() {
24857        let a = _mm_set_epi64x(1, 2);
24858        let r = _mm_cvtepu64_ph(a);
24859        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
24860        assert_eq_m128h(r, e);
24861    }
24862
24863    #[simd_test(enable = "avx512fp16,avx512vl")]
24864    unsafe fn test_mm_mask_cvtepu64_ph() {
24865        let a = _mm_set_epi64x(1, 2);
24866        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24867        let r = _mm_mask_cvtepu64_ph(src, 0b01, a);
24868        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
24869        assert_eq_m128h(r, e);
24870    }
24871
24872    #[simd_test(enable = "avx512fp16,avx512vl")]
24873    unsafe fn test_mm_maskz_cvtepu64_ph() {
24874        let a = _mm_set_epi64x(1, 2);
24875        let r = _mm_maskz_cvtepu64_ph(0b01, a);
24876        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
24877        assert_eq_m128h(r, e);
24878    }
24879
24880    #[simd_test(enable = "avx512fp16,avx512vl")]
24881    unsafe fn test_mm256_cvtepu64_ph() {
24882        let a = _mm256_set_epi64x(1, 2, 3, 4);
24883        let r = _mm256_cvtepu64_ph(a);
24884        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24885        assert_eq_m128h(r, e);
24886    }
24887
24888    #[simd_test(enable = "avx512fp16,avx512vl")]
24889    unsafe fn test_mm256_mask_cvtepu64_ph() {
24890        let a = _mm256_set_epi64x(1, 2, 3, 4);
24891        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24892        let r = _mm256_mask_cvtepu64_ph(src, 0b0101, a);
24893        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
24894        assert_eq_m128h(r, e);
24895    }
24896
24897    #[simd_test(enable = "avx512fp16,avx512vl")]
24898    unsafe fn test_mm256_maskz_cvtepu64_ph() {
24899        let a = _mm256_set_epi64x(1, 2, 3, 4);
24900        let r = _mm256_maskz_cvtepu64_ph(0b0101, a);
24901        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24902        assert_eq_m128h(r, e);
24903    }
24904
24905    #[simd_test(enable = "avx512fp16,avx512vl")]
24906    unsafe fn test_mm512_cvtepu64_ph() {
24907        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24908        let r = _mm512_cvtepu64_ph(a);
24909        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24910        assert_eq_m128h(r, e);
24911    }
24912
24913    #[simd_test(enable = "avx512fp16,avx512vl")]
24914    unsafe fn test_mm512_mask_cvtepu64_ph() {
24915        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24916        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24917        let r = _mm512_mask_cvtepu64_ph(src, 0b01010101, a);
24918        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24919        assert_eq_m128h(r, e);
24920    }
24921
24922    #[simd_test(enable = "avx512fp16,avx512vl")]
24923    unsafe fn test_mm512_maskz_cvtepu64_ph() {
24924        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24925        let r = _mm512_maskz_cvtepu64_ph(0b01010101, a);
24926        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24927        assert_eq_m128h(r, e);
24928    }
24929
24930    #[simd_test(enable = "avx512fp16,avx512vl")]
24931    unsafe fn test_mm512_cvt_roundepu64_ph() {
24932        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24933        let r = _mm512_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
24934        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24935        assert_eq_m128h(r, e);
24936    }
24937
24938    #[simd_test(enable = "avx512fp16,avx512vl")]
24939    unsafe fn test_mm512_mask_cvt_roundepu64_ph() {
24940        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24941        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24942        let r = _mm512_mask_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24943            src, 0b01010101, a,
24944        );
24945        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24946        assert_eq_m128h(r, e);
24947    }
24948
24949    #[simd_test(enable = "avx512fp16,avx512vl")]
24950    unsafe fn test_mm512_maskz_cvt_roundepu64_ph() {
24951        let a = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
24952        let r = _mm512_maskz_cvt_roundepu64_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
24953            0b01010101, a,
24954        );
24955        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
24956        assert_eq_m128h(r, e);
24957    }
24958
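    // f32 -> f16 conversions. The `x` in `cvtxps_ph` appears to distinguish these
    // AVX512-FP16 intrinsics, which return __m128h/__m256h, from the older
    // F16C/AVX512F `_mm_cvtps_ph` family, which returns the FP16 bits packed in
    // integer vectors.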
24959    #[simd_test(enable = "avx512fp16,avx512vl")]
24960    unsafe fn test_mm_cvtxps_ph() {
24961        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24962        let r = _mm_cvtxps_ph(a);
24963        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
24964        assert_eq_m128h(r, e);
24965    }
24966
24967    #[simd_test(enable = "avx512fp16,avx512vl")]
24968    unsafe fn test_mm_mask_cvtxps_ph() {
24969        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24970        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24971        let r = _mm_mask_cvtxps_ph(src, 0b0101, a);
24972        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16., 4.0);
24973        assert_eq_m128h(r, e);
24974    }
24975
24976    #[simd_test(enable = "avx512fp16,avx512vl")]
24977    unsafe fn test_mm_maskz_cvtxps_ph() {
24978        let a = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
24979        let r = _mm_maskz_cvtxps_ph(0b0101, a);
24980        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
24981        assert_eq_m128h(r, e);
24982    }
24983
24984    #[simd_test(enable = "avx512fp16,avx512vl")]
24985    unsafe fn test_mm256_cvtxps_ph() {
24986        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24987        let r = _mm256_cvtxps_ph(a);
24988        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24989        assert_eq_m128h(r, e);
24990    }
24991
24992    #[simd_test(enable = "avx512fp16,avx512vl")]
24993    unsafe fn test_mm256_mask_cvtxps_ph() {
24994        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
24995        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
24996        let r = _mm256_mask_cvtxps_ph(src, 0b01010101, a);
24997        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
24998        assert_eq_m128h(r, e);
24999    }
25000
25001    #[simd_test(enable = "avx512fp16,avx512vl")]
25002    unsafe fn test_mm256_maskz_cvtxps_ph() {
25003        let a = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25004        let r = _mm256_maskz_cvtxps_ph(0b01010101, a);
25005        let e = _mm_set_ph(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
25006        assert_eq_m128h(r, e);
25007    }
25008
25009    #[simd_test(enable = "avx512fp16")]
25010    unsafe fn test_mm512_cvtxps_ph() {
25011        let a = _mm512_set_ps(
25012            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25013        );
25014        let r = _mm512_cvtxps_ph(a);
25015        let e = _mm256_set_ph(
25016            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25017        );
25018        assert_eq_m256h(r, e);
25019    }
25020
25021    #[simd_test(enable = "avx512fp16")]
25022    unsafe fn test_mm512_mask_cvtxps_ph() {
25023        let a = _mm512_set_ps(
25024            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25025        );
25026        let src = _mm256_set_ph(
25027            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
25028        );
25029        let r = _mm512_mask_cvtxps_ph(src, 0b0101010101010101, a);
25030        let e = _mm256_set_ph(
25031            10., 2.0, 12., 4.0, 14., 6.0, 16., 8.0, 18., 10.0, 20., 12.0, 22., 14.0, 24., 16.0,
25032        );
25033        assert_eq_m256h(r, e);
25034    }
25035
25036    #[simd_test(enable = "avx512fp16")]
25037    unsafe fn test_mm512_maskz_cvtxps_ph() {
25038        let a = _mm512_set_ps(
25039            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25040        );
25041        let r = _mm512_maskz_cvtxps_ph(0b0101010101010101, a);
25042        let e = _mm256_set_ph(
25043            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25044        );
25045        assert_eq_m256h(r, e);
25046    }
25047
25048    #[simd_test(enable = "avx512fp16")]
25049    unsafe fn test_mm512_cvtx_roundps_ph() {
25050        let a = _mm512_set_ps(
25051            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25052        );
25053        let r = _mm512_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25054        let e = _mm256_set_ph(
25055            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25056        );
25057        assert_eq_m256h(r, e);
25058    }
25059
25060    #[simd_test(enable = "avx512fp16")]
25061    unsafe fn test_mm512_mask_cvtx_roundps_ph() {
25062        let a = _mm512_set_ps(
25063            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25064        );
25065        let src = _mm256_set_ph(
25066            10., 11., 12., 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
25067        );
25068        let r = _mm512_mask_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25069            src,
25070            0b0101010101010101,
25071            a,
25072        );
25073        let e = _mm256_set_ph(
25074            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
25075            16.0,
25076        );
25077        assert_eq_m256h(r, e);
25078    }
25079
25080    #[simd_test(enable = "avx512fp16")]
25081    unsafe fn test_mm512_maskz_cvtx_roundps_ph() {
25082        let a = _mm512_set_ps(
25083            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25084        );
25085        let r = _mm512_maskz_cvtx_roundps_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25086            0b0101010101010101,
25087            a,
25088        );
25089        let e = _mm256_set_ph(
25090            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
25091        );
25092        assert_eq_m256h(r, e);
25093    }
25094
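    // Scalar f32 -> f16 conversion: lane 0 of the result is converted from the
    // lowest lane of `b`, the other lanes are copied from `a`; the masked forms
    // fall back to `src` or zero when the mask bit is clear.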
25095    #[simd_test(enable = "avx512fp16,avx512vl")]
25096    unsafe fn test_mm_cvtss_sh() {
25097        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25098        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25099        let r = _mm_cvtss_sh(a, b);
25100        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25101        assert_eq_m128h(r, e);
25102    }
25103
25104    #[simd_test(enable = "avx512fp16,avx512vl")]
25105    unsafe fn test_mm_mask_cvtss_sh() {
25106        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25107        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25108        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25109        let r = _mm_mask_cvtss_sh(src, 0, a, b);
25110        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25111        assert_eq_m128h(r, e);
25112        let r = _mm_mask_cvtss_sh(src, 1, a, b);
25113        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25114        assert_eq_m128h(r, e);
25115    }
25116
25117    #[simd_test(enable = "avx512fp16,avx512vl")]
25118    unsafe fn test_mm_maskz_cvtss_sh() {
25119        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25120        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25121        let r = _mm_maskz_cvtss_sh(0, a, b);
25122        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25123        assert_eq_m128h(r, e);
25124        let r = _mm_maskz_cvtss_sh(1, a, b);
25125        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25126        assert_eq_m128h(r, e);
25127    }
25128
25129    #[simd_test(enable = "avx512fp16,avx512vl")]
25130    unsafe fn test_mm_cvt_roundss_sh() {
25131        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25132        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25133        let r = _mm_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25134        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25135        assert_eq_m128h(r, e);
25136    }
25137
25138    #[simd_test(enable = "avx512fp16,avx512vl")]
25139    unsafe fn test_mm_mask_cvt_roundss_sh() {
25140        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25141        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25142        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25143        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25144            src, 0, a, b,
25145        );
25146        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25147        assert_eq_m128h(r, e);
25148        let r = _mm_mask_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25149            src, 1, a, b,
25150        );
25151        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25152        assert_eq_m128h(r, e);
25153    }
25154
25155    #[simd_test(enable = "avx512fp16,avx512vl")]
25156    unsafe fn test_mm_maskz_cvt_roundss_sh() {
25157        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25158        let b = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
25159        let r =
25160            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25161        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25162        assert_eq_m128h(r, e);
25163        let r =
25164            _mm_maskz_cvt_roundss_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25165        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25166        assert_eq_m128h(r, e);
25167    }
25168
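    // f64 -> f16 conversions follow the same narrowing pattern as the 64-bit
    // integer ones above: the result is always a __m128h with unused upper lanes
    // zeroed.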
25169    #[simd_test(enable = "avx512fp16,avx512vl")]
25170    unsafe fn test_mm_cvtpd_ph() {
25171        let a = _mm_set_pd(1.0, 2.0);
25172        let r = _mm_cvtpd_ph(a);
25173        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
25174        assert_eq_m128h(r, e);
25175    }
25176
25177    #[simd_test(enable = "avx512fp16,avx512vl")]
25178    unsafe fn test_mm_mask_cvtpd_ph() {
25179        let a = _mm_set_pd(1.0, 2.0);
25180        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25181        let r = _mm_mask_cvtpd_ph(src, 0b01, a);
25182        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16., 2.);
25183        assert_eq_m128h(r, e);
25184    }
25185
25186    #[simd_test(enable = "avx512fp16,avx512vl")]
25187    unsafe fn test_mm_maskz_cvtpd_ph() {
25188        let a = _mm_set_pd(1.0, 2.0);
25189        let r = _mm_maskz_cvtpd_ph(0b01, a);
25190        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0);
25191        assert_eq_m128h(r, e);
25192    }
25193
25194    #[simd_test(enable = "avx512fp16,avx512vl")]
25195    unsafe fn test_mm256_cvtpd_ph() {
25196        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25197        let r = _mm256_cvtpd_ph(a);
25198        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
25199        assert_eq_m128h(r, e);
25200    }
25201
25202    #[simd_test(enable = "avx512fp16,avx512vl")]
25203    unsafe fn test_mm256_mask_cvtpd_ph() {
25204        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25205        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25206        let r = _mm256_mask_cvtpd_ph(src, 0b0101, a);
25207        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 14., 2.0, 16.0, 4.0);
25208        assert_eq_m128h(r, e);
25209    }
25210
25211    #[simd_test(enable = "avx512fp16,avx512vl")]
25212    unsafe fn test_mm256_maskz_cvtpd_ph() {
25213        let a = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
25214        let r = _mm256_maskz_cvtpd_ph(0b0101, a);
25215        let e = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 4.0);
25216        assert_eq_m128h(r, e);
25217    }
25218
25219    #[simd_test(enable = "avx512fp16,avx512vl")]
25220    unsafe fn test_mm512_cvtpd_ph() {
25221        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25222        let r = _mm512_cvtpd_ph(a);
25223        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25224        assert_eq_m128h(r, e);
25225    }
25226
25227    #[simd_test(enable = "avx512fp16,avx512vl")]
25228    unsafe fn test_mm512_mask_cvtpd_ph() {
25229        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25230        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25231        let r = _mm512_mask_cvtpd_ph(src, 0b01010101, a);
25232        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25233        assert_eq_m128h(r, e);
25234    }
25235
25236    #[simd_test(enable = "avx512fp16,avx512vl")]
25237    unsafe fn test_mm512_maskz_cvtpd_ph() {
25238        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25239        let r = _mm512_maskz_cvtpd_ph(0b01010101, a);
25240        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25241        assert_eq_m128h(r, e);
25242    }
25243
25244    #[simd_test(enable = "avx512fp16,avx512vl")]
25245    unsafe fn test_mm512_cvt_roundpd_ph() {
25246        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25247        let r = _mm512_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25248        let e = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25249        assert_eq_m128h(r, e);
25250    }
25251
25252    #[simd_test(enable = "avx512fp16,avx512vl")]
25253    unsafe fn test_mm512_mask_cvt_roundpd_ph() {
25254        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25255        let src = _mm_set_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25256        let r = _mm512_mask_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25257            src, 0b01010101, a,
25258        );
25259        let e = _mm_set_ph(10., 2., 12., 4., 14., 6., 16., 8.);
25260        assert_eq_m128h(r, e);
25261    }
25262
25263    #[simd_test(enable = "avx512fp16,avx512vl")]
25264    unsafe fn test_mm512_maskz_cvt_roundpd_ph() {
25265        let a = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25266        let r = _mm512_maskz_cvt_roundpd_ph::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25267            0b01010101, a,
25268        );
25269        let e = _mm_set_ph(0.0, 2., 0.0, 4., 0.0, 6., 0.0, 8.);
25270        assert_eq_m128h(r, e);
25271    }
25272
25273    #[simd_test(enable = "avx512fp16,avx512vl")]
25274    unsafe fn test_mm_cvtsd_sh() {
25275        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25276        let b = _mm_setr_pd(1.0, 2.0);
25277        let r = _mm_cvtsd_sh(a, b);
25278        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25279        assert_eq_m128h(r, e);
25280    }
25281
25282    #[simd_test(enable = "avx512fp16,avx512vl")]
25283    unsafe fn test_mm_mask_cvtsd_sh() {
25284        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25285        let b = _mm_setr_pd(1.0, 2.0);
25286        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25287        let r = _mm_mask_cvtsd_sh(src, 0, a, b);
25288        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25289        assert_eq_m128h(r, e);
25290        let r = _mm_mask_cvtsd_sh(src, 1, a, b);
25291        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25292        assert_eq_m128h(r, e);
25293    }
25294
25295    #[simd_test(enable = "avx512fp16,avx512vl")]
25296    unsafe fn test_mm_maskz_cvtsd_sh() {
25297        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25298        let b = _mm_setr_pd(1.0, 2.0);
25299        let r = _mm_maskz_cvtsd_sh(0, a, b);
25300        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25301        assert_eq_m128h(r, e);
25302        let r = _mm_maskz_cvtsd_sh(1, a, b);
25303        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25304        assert_eq_m128h(r, e);
25305    }
25306
25307    #[simd_test(enable = "avx512fp16,avx512vl")]
25308    unsafe fn test_mm_cvt_roundsd_sh() {
25309        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25310        let b = _mm_setr_pd(1.0, 2.0);
25311        let r = _mm_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a, b);
25312        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25313        assert_eq_m128h(r, e);
25314    }
25315
25316    #[simd_test(enable = "avx512fp16,avx512vl")]
25317    unsafe fn test_mm_mask_cvt_roundsd_sh() {
25318        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25319        let b = _mm_setr_pd(1.0, 2.0);
25320        let src = _mm_setr_ph(20., 21., 22., 23., 24., 25., 26., 27.);
25321        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25322            src, 0, a, b,
25323        );
25324        let e = _mm_setr_ph(20., 11., 12., 13., 14., 15., 16., 17.);
25325        assert_eq_m128h(r, e);
25326        let r = _mm_mask_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25327            src, 1, a, b,
25328        );
25329        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25330        assert_eq_m128h(r, e);
25331    }
25332
25333    #[simd_test(enable = "avx512fp16,avx512vl")]
25334    unsafe fn test_mm_maskz_cvt_roundsd_sh() {
25335        let a = _mm_setr_ph(10., 11., 12., 13., 14., 15., 16., 17.);
25336        let b = _mm_setr_pd(1.0, 2.0);
25337        let r =
25338            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(0, a, b);
25339        let e = _mm_setr_ph(0.0, 11., 12., 13., 14., 15., 16., 17.);
25340        assert_eq_m128h(r, e);
25341        let r =
25342            _mm_maskz_cvt_roundsd_sh::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(1, a, b);
25343        let e = _mm_setr_ph(1.0, 11., 12., 13., 14., 15., 16., 17.);
25344        assert_eq_m128h(r, e);
25345    }
25346
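    // f16 -> signed 16-bit integer conversions. The `cvt` forms round according
    // to MXCSR (or to the explicit rounding mode of the `_round` forms); the
    // `cvtt` forms further below truncate toward zero. For the whole-number
    // inputs used here both give the same results.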
25347    #[simd_test(enable = "avx512fp16,avx512vl")]
25348    unsafe fn test_mm_cvtph_epi16() {
25349        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25350        let r = _mm_cvtph_epi16(a);
25351        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25352        assert_eq_m128i(r, e);
25353    }
25354
25355    #[simd_test(enable = "avx512fp16,avx512vl")]
25356    unsafe fn test_mm_mask_cvtph_epi16() {
25357        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25358        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25359        let r = _mm_mask_cvtph_epi16(src, 0b01010101, a);
25360        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25361        assert_eq_m128i(r, e);
25362    }
25363
25364    #[simd_test(enable = "avx512fp16,avx512vl")]
25365    unsafe fn test_mm_maskz_cvtph_epi16() {
25366        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25367        let r = _mm_maskz_cvtph_epi16(0b01010101, a);
25368        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25369        assert_eq_m128i(r, e);
25370    }
25371
25372    #[simd_test(enable = "avx512fp16,avx512vl")]
25373    unsafe fn test_mm256_cvtph_epi16() {
25374        let a = _mm256_set_ph(
25375            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25376        );
25377        let r = _mm256_cvtph_epi16(a);
25378        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25379        assert_eq_m256i(r, e);
25380    }
25381
25382    #[simd_test(enable = "avx512fp16,avx512vl")]
25383    unsafe fn test_mm256_mask_cvtph_epi16() {
25384        let a = _mm256_set_ph(
25385            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25386        );
25387        let src = _mm256_set_epi16(
25388            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25389        );
25390        let r = _mm256_mask_cvtph_epi16(src, 0b0101010101010101, a);
25391        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25392        assert_eq_m256i(r, e);
25393    }
25394
25395    #[simd_test(enable = "avx512fp16,avx512vl")]
25396    unsafe fn test_mm256_maskz_cvtph_epi16() {
25397        let a = _mm256_set_ph(
25398            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25399        );
25400        let r = _mm256_maskz_cvtph_epi16(0b0101010101010101, a);
25401        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25402        assert_eq_m256i(r, e);
25403    }
25404
25405    #[simd_test(enable = "avx512fp16")]
25406    unsafe fn test_mm512_cvtph_epi16() {
25407        let a = _mm512_set_ph(
25408            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25409            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25410            31.0, 32.0,
25411        );
25412        let r = _mm512_cvtph_epi16(a);
25413        let e = _mm512_set_epi16(
25414            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25415            25, 26, 27, 28, 29, 30, 31, 32,
25416        );
25417        assert_eq_m512i(r, e);
25418    }
25419
25420    #[simd_test(enable = "avx512fp16")]
25421    unsafe fn test_mm512_mask_cvtph_epi16() {
25422        let a = _mm512_set_ph(
25423            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25424            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25425            31.0, 32.0,
25426        );
25427        let src = _mm512_set_epi16(
25428            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25429            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25430        );
25431        let r = _mm512_mask_cvtph_epi16(src, 0b01010101010101010101010101010101, a);
25432        let e = _mm512_set_epi16(
25433            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25434            24, 34, 26, 36, 28, 38, 30, 40, 32,
25435        );
25436        assert_eq_m512i(r, e);
25437    }
25438
25439    #[simd_test(enable = "avx512fp16")]
25440    unsafe fn test_mm512_maskz_cvtph_epi16() {
25441        let a = _mm512_set_ph(
25442            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25443            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25444            31.0, 32.0,
25445        );
25446        let r = _mm512_maskz_cvtph_epi16(0b01010101010101010101010101010101, a);
25447        let e = _mm512_set_epi16(
25448            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25449            0, 28, 0, 30, 0, 32,
25450        );
25451        assert_eq_m512i(r, e);
25452    }
25453
25454    #[simd_test(enable = "avx512fp16")]
25455    unsafe fn test_mm512_cvt_roundph_epi16() {
25456        let a = _mm512_set_ph(
25457            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25458            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25459            31.0, 32.0,
25460        );
25461        let r = _mm512_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25462        let e = _mm512_set_epi16(
25463            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25464            25, 26, 27, 28, 29, 30, 31, 32,
25465        );
25466        assert_eq_m512i(r, e);
25467    }
25468
25469    #[simd_test(enable = "avx512fp16")]
25470    unsafe fn test_mm512_mask_cvt_roundph_epi16() {
25471        let a = _mm512_set_ph(
25472            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25473            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25474            31.0, 32.0,
25475        );
25476        let src = _mm512_set_epi16(
25477            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25478            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25479        );
25480        let r = _mm512_mask_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25481            src,
25482            0b01010101010101010101010101010101,
25483            a,
25484        );
25485        let e = _mm512_set_epi16(
25486            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25487            24, 34, 26, 36, 28, 38, 30, 40, 32,
25488        );
25489        assert_eq_m512i(r, e);
25490    }
25491
25492    #[simd_test(enable = "avx512fp16")]
25493    unsafe fn test_mm512_maskz_cvt_roundph_epi16() {
25494        let a = _mm512_set_ph(
25495            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25496            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25497            31.0, 32.0,
25498        );
25499        let r = _mm512_maskz_cvt_roundph_epi16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25500            0b01010101010101010101010101010101,
25501            a,
25502        );
25503        let e = _mm512_set_epi16(
25504            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25505            0, 28, 0, 30, 0, 32,
25506        );
25507        assert_eq_m512i(r, e);
25508    }
25509
25510    #[simd_test(enable = "avx512fp16,avx512vl")]
25511    unsafe fn test_mm_cvtph_epu16() {
25512        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25513        let r = _mm_cvtph_epu16(a);
25514        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25515        assert_eq_m128i(r, e);
25516    }
25517
25518    #[simd_test(enable = "avx512fp16,avx512vl")]
25519    unsafe fn test_mm_mask_cvtph_epu16() {
25520        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25521        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25522        let r = _mm_mask_cvtph_epu16(src, 0b01010101, a);
25523        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25524        assert_eq_m128i(r, e);
25525    }
25526
25527    #[simd_test(enable = "avx512fp16,avx512vl")]
25528    unsafe fn test_mm_maskz_cvtph_epu16() {
25529        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25530        let r = _mm_maskz_cvtph_epu16(0b01010101, a);
25531        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25532        assert_eq_m128i(r, e);
25533    }
25534
25535    #[simd_test(enable = "avx512fp16,avx512vl")]
25536    unsafe fn test_mm256_cvtph_epu16() {
25537        let a = _mm256_set_ph(
25538            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25539        );
25540        let r = _mm256_cvtph_epu16(a);
25541        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25542        assert_eq_m256i(r, e);
25543    }
25544
25545    #[simd_test(enable = "avx512fp16,avx512vl")]
25546    unsafe fn test_mm256_mask_cvtph_epu16() {
25547        let a = _mm256_set_ph(
25548            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25549        );
25550        let src = _mm256_set_epi16(
25551            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25552        );
25553        let r = _mm256_mask_cvtph_epu16(src, 0b0101010101010101, a);
25554        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25555        assert_eq_m256i(r, e);
25556    }
25557
25558    #[simd_test(enable = "avx512fp16,avx512vl")]
25559    unsafe fn test_mm256_maskz_cvtph_epu16() {
25560        let a = _mm256_set_ph(
25561            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25562        );
25563        let r = _mm256_maskz_cvtph_epu16(0b0101010101010101, a);
25564        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25565        assert_eq_m256i(r, e);
25566    }
25567
25568    #[simd_test(enable = "avx512fp16")]
25569    unsafe fn test_mm512_cvtph_epu16() {
25570        let a = _mm512_set_ph(
25571            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25572            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25573            31.0, 32.0,
25574        );
25575        let r = _mm512_cvtph_epu16(a);
25576        let e = _mm512_set_epi16(
25577            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25578            25, 26, 27, 28, 29, 30, 31, 32,
25579        );
25580        assert_eq_m512i(r, e);
25581    }
25582
25583    #[simd_test(enable = "avx512fp16")]
25584    unsafe fn test_mm512_mask_cvtph_epu16() {
25585        let a = _mm512_set_ph(
25586            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25587            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25588            31.0, 32.0,
25589        );
25590        let src = _mm512_set_epi16(
25591            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25592            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25593        );
25594        let r = _mm512_mask_cvtph_epu16(src, 0b01010101010101010101010101010101, a);
25595        let e = _mm512_set_epi16(
25596            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25597            24, 34, 26, 36, 28, 38, 30, 40, 32,
25598        );
25599        assert_eq_m512i(r, e);
25600    }
25601
25602    #[simd_test(enable = "avx512fp16")]
25603    unsafe fn test_mm512_maskz_cvtph_epu16() {
25604        let a = _mm512_set_ph(
25605            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25606            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25607            31.0, 32.0,
25608        );
25609        let r = _mm512_maskz_cvtph_epu16(0b01010101010101010101010101010101, a);
25610        let e = _mm512_set_epi16(
25611            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25612            0, 28, 0, 30, 0, 32,
25613        );
25614        assert_eq_m512i(r, e);
25615    }
25616
25617    #[simd_test(enable = "avx512fp16")]
25618    unsafe fn test_mm512_cvt_roundph_epu16() {
25619        let a = _mm512_set_ph(
25620            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25621            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25622            31.0, 32.0,
25623        );
25624        let r = _mm512_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
25625        let e = _mm512_set_epi16(
25626            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25627            25, 26, 27, 28, 29, 30, 31, 32,
25628        );
25629        assert_eq_m512i(r, e);
25630    }
25631
25632    #[simd_test(enable = "avx512fp16")]
25633    unsafe fn test_mm512_mask_cvt_roundph_epu16() {
25634        let a = _mm512_set_ph(
25635            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25636            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25637            31.0, 32.0,
25638        );
25639        let src = _mm512_set_epi16(
25640            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25641            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25642        );
25643        let r = _mm512_mask_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25644            src,
25645            0b01010101010101010101010101010101,
25646            a,
25647        );
25648        let e = _mm512_set_epi16(
25649            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25650            24, 34, 26, 36, 28, 38, 30, 40, 32,
25651        );
25652        assert_eq_m512i(r, e);
25653    }
25654
25655    #[simd_test(enable = "avx512fp16")]
25656    unsafe fn test_mm512_maskz_cvt_roundph_epu16() {
25657        let a = _mm512_set_ph(
25658            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25659            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25660            31.0, 32.0,
25661        );
25662        let r = _mm512_maskz_cvt_roundph_epu16::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
25663            0b01010101010101010101010101010101,
25664            a,
25665        );
25666        let e = _mm512_set_epi16(
25667            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25668            0, 28, 0, 30, 0, 32,
25669        );
25670        assert_eq_m512i(r, e);
25671    }
25672
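    // Truncating f16 -> integer conversions. The `cvtt_roundph_*` forms take only
    // `_MM_FROUND_NO_EXC` (SAE); the conversion always truncates toward zero
    // rather than honoring a rounding mode.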
25673    #[simd_test(enable = "avx512fp16,avx512vl")]
25674    unsafe fn test_mm_cvttph_epi16() {
25675        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25676        let r = _mm_cvttph_epi16(a);
25677        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25678        assert_eq_m128i(r, e);
25679    }
25680
25681    #[simd_test(enable = "avx512fp16,avx512vl")]
25682    unsafe fn test_mm_mask_cvttph_epi16() {
25683        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25684        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25685        let r = _mm_mask_cvttph_epi16(src, 0b01010101, a);
25686        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25687        assert_eq_m128i(r, e);
25688    }
25689
25690    #[simd_test(enable = "avx512fp16,avx512vl")]
25691    unsafe fn test_mm_maskz_cvttph_epi16() {
25692        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25693        let r = _mm_maskz_cvttph_epi16(0b01010101, a);
25694        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25695        assert_eq_m128i(r, e);
25696    }
25697
25698    #[simd_test(enable = "avx512fp16,avx512vl")]
25699    unsafe fn test_mm256_cvttph_epi16() {
25700        let a = _mm256_set_ph(
25701            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25702        );
25703        let r = _mm256_cvttph_epi16(a);
25704        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25705        assert_eq_m256i(r, e);
25706    }
25707
25708    #[simd_test(enable = "avx512fp16,avx512vl")]
25709    unsafe fn test_mm256_mask_cvttph_epi16() {
25710        let a = _mm256_set_ph(
25711            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25712        );
25713        let src = _mm256_set_epi16(
25714            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25715        );
25716        let r = _mm256_mask_cvttph_epi16(src, 0b0101010101010101, a);
25717        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25718        assert_eq_m256i(r, e);
25719    }
25720
25721    #[simd_test(enable = "avx512fp16,avx512vl")]
25722    unsafe fn test_mm256_maskz_cvttph_epi16() {
25723        let a = _mm256_set_ph(
25724            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25725        );
25726        let r = _mm256_maskz_cvttph_epi16(0b0101010101010101, a);
25727        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25728        assert_eq_m256i(r, e);
25729    }
25730
25731    #[simd_test(enable = "avx512fp16")]
25732    unsafe fn test_mm512_cvttph_epi16() {
25733        let a = _mm512_set_ph(
25734            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25735            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25736            31.0, 32.0,
25737        );
25738        let r = _mm512_cvttph_epi16(a);
25739        let e = _mm512_set_epi16(
25740            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25741            25, 26, 27, 28, 29, 30, 31, 32,
25742        );
25743        assert_eq_m512i(r, e);
25744    }
25745
25746    #[simd_test(enable = "avx512fp16")]
25747    unsafe fn test_mm512_mask_cvttph_epi16() {
25748        let a = _mm512_set_ph(
25749            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25750            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25751            31.0, 32.0,
25752        );
25753        let src = _mm512_set_epi16(
25754            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25755            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25756        );
25757        let r = _mm512_mask_cvttph_epi16(src, 0b01010101010101010101010101010101, a);
25758        let e = _mm512_set_epi16(
25759            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25760            24, 34, 26, 36, 28, 38, 30, 40, 32,
25761        );
25762        assert_eq_m512i(r, e);
25763    }
25764
25765    #[simd_test(enable = "avx512fp16")]
25766    unsafe fn test_mm512_maskz_cvttph_epi16() {
25767        let a = _mm512_set_ph(
25768            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25769            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25770            31.0, 32.0,
25771        );
25772        let r = _mm512_maskz_cvttph_epi16(0b01010101010101010101010101010101, a);
25773        let e = _mm512_set_epi16(
25774            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25775            0, 28, 0, 30, 0, 32,
25776        );
25777        assert_eq_m512i(r, e);
25778    }
25779
25780    #[simd_test(enable = "avx512fp16")]
25781    unsafe fn test_mm512_cvtt_roundph_epi16() {
25782        let a = _mm512_set_ph(
25783            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25784            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25785            31.0, 32.0,
25786        );
25787        let r = _mm512_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(a);
25788        let e = _mm512_set_epi16(
25789            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25790            25, 26, 27, 28, 29, 30, 31, 32,
25791        );
25792        assert_eq_m512i(r, e);
25793    }
25794
25795    #[simd_test(enable = "avx512fp16")]
25796    unsafe fn test_mm512_mask_cvtt_roundph_epi16() {
25797        let a = _mm512_set_ph(
25798            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25799            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25800            31.0, 32.0,
25801        );
25802        let src = _mm512_set_epi16(
25803            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25804            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25805        );
25806        let r = _mm512_mask_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25807            src,
25808            0b01010101010101010101010101010101,
25809            a,
25810        );
25811        let e = _mm512_set_epi16(
25812            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25813            24, 34, 26, 36, 28, 38, 30, 40, 32,
25814        );
25815        assert_eq_m512i(r, e);
25816    }
25817
25818    #[simd_test(enable = "avx512fp16")]
25819    unsafe fn test_mm512_maskz_cvtt_roundph_epi16() {
25820        let a = _mm512_set_ph(
25821            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25822            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25823            31.0, 32.0,
25824        );
25825        let r = _mm512_maskz_cvtt_roundph_epi16::<_MM_FROUND_NO_EXC>(
25826            0b01010101010101010101010101010101,
25827            a,
25828        );
25829        let e = _mm512_set_epi16(
25830            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25831            0, 28, 0, 30, 0, 32,
25832        );
25833        assert_eq_m512i(r, e);
25834    }
25835
25836    #[simd_test(enable = "avx512fp16,avx512vl")]
25837    unsafe fn test_mm_cvttph_epu16() {
25838        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25839        let r = _mm_cvttph_epu16(a);
25840        let e = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
25841        assert_eq_m128i(r, e);
25842    }
25843
25844    #[simd_test(enable = "avx512fp16,avx512vl")]
25845    unsafe fn test_mm_mask_cvttph_epu16() {
25846        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25847        let src = _mm_set_epi16(10, 11, 12, 13, 14, 15, 16, 17);
25848        let r = _mm_mask_cvttph_epu16(src, 0b01010101, a);
25849        let e = _mm_set_epi16(10, 2, 12, 4, 14, 6, 16, 8);
25850        assert_eq_m128i(r, e);
25851    }
25852
25853    #[simd_test(enable = "avx512fp16,avx512vl")]
25854    unsafe fn test_mm_maskz_cvttph_epu16() {
25855        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
25856        let r = _mm_maskz_cvttph_epu16(0b01010101, a);
25857        let e = _mm_set_epi16(0, 2, 0, 4, 0, 6, 0, 8);
25858        assert_eq_m128i(r, e);
25859    }
25860
25861    #[simd_test(enable = "avx512fp16,avx512vl")]
25862    unsafe fn test_mm256_cvttph_epu16() {
25863        let a = _mm256_set_ph(
25864            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25865        );
25866        let r = _mm256_cvttph_epu16(a);
25867        let e = _mm256_set_epi16(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
25868        assert_eq_m256i(r, e);
25869    }
25870
25871    #[simd_test(enable = "avx512fp16,avx512vl")]
25872    unsafe fn test_mm256_mask_cvttph_epu16() {
25873        let a = _mm256_set_ph(
25874            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25875        );
25876        let src = _mm256_set_epi16(
25877            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
25878        );
25879        let r = _mm256_mask_cvttph_epu16(src, 0b0101010101010101, a);
25880        let e = _mm256_set_epi16(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
25881        assert_eq_m256i(r, e);
25882    }
25883
25884    #[simd_test(enable = "avx512fp16,avx512vl")]
25885    unsafe fn test_mm256_maskz_cvttph_epu16() {
25886        let a = _mm256_set_ph(
25887            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25888        );
25889        let r = _mm256_maskz_cvttph_epu16(0b0101010101010101, a);
25890        let e = _mm256_set_epi16(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
25891        assert_eq_m256i(r, e);
25892    }
25893
25894    #[simd_test(enable = "avx512fp16")]
25895    unsafe fn test_mm512_cvttph_epu16() {
25896        let a = _mm512_set_ph(
25897            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25898            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25899            31.0, 32.0,
25900        );
25901        let r = _mm512_cvttph_epu16(a);
25902        let e = _mm512_set_epi16(
25903            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25904            25, 26, 27, 28, 29, 30, 31, 32,
25905        );
25906        assert_eq_m512i(r, e);
25907    }
25908
25909    #[simd_test(enable = "avx512fp16")]
25910    unsafe fn test_mm512_mask_cvttph_epu16() {
25911        let a = _mm512_set_ph(
25912            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25913            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25914            31.0, 32.0,
25915        );
25916        let src = _mm512_set_epi16(
25917            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25918            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25919        );
25920        let r = _mm512_mask_cvttph_epu16(src, 0b01010101010101010101010101010101, a);
25921        let e = _mm512_set_epi16(
25922            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25923            24, 34, 26, 36, 28, 38, 30, 40, 32,
25924        );
25925        assert_eq_m512i(r, e);
25926    }
25927
25928    #[simd_test(enable = "avx512fp16")]
25929    unsafe fn test_mm512_maskz_cvttph_epu16() {
25930        let a = _mm512_set_ph(
25931            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25932            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25933            31.0, 32.0,
25934        );
25935        let r = _mm512_maskz_cvttph_epu16(0b01010101010101010101010101010101, a);
25936        let e = _mm512_set_epi16(
25937            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25938            0, 28, 0, 30, 0, 32,
25939        );
25940        assert_eq_m512i(r, e);
25941    }
25942
25943    #[simd_test(enable = "avx512fp16")]
25944    unsafe fn test_mm512_cvtt_roundph_epu16() {
25945        let a = _mm512_set_ph(
25946            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25947            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25948            31.0, 32.0,
25949        );
25950        let r = _mm512_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(a);
25951        let e = _mm512_set_epi16(
25952            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
25953            25, 26, 27, 28, 29, 30, 31, 32,
25954        );
25955        assert_eq_m512i(r, e);
25956    }
25957
25958    #[simd_test(enable = "avx512fp16")]
25959    unsafe fn test_mm512_mask_cvtt_roundph_epu16() {
25960        let a = _mm512_set_ph(
25961            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25962            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25963            31.0, 32.0,
25964        );
25965        let src = _mm512_set_epi16(
25966            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
25967            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
25968        );
25969        let r = _mm512_mask_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25970            src,
25971            0b01010101010101010101010101010101,
25972            a,
25973        );
25974        let e = _mm512_set_epi16(
25975            10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16, 26, 18, 28, 20, 30, 22, 32,
25976            24, 34, 26, 36, 28, 38, 30, 40, 32,
25977        );
25978        assert_eq_m512i(r, e);
25979    }
25980
25981    #[simd_test(enable = "avx512fp16")]
25982    unsafe fn test_mm512_maskz_cvtt_roundph_epu16() {
25983        let a = _mm512_set_ph(
25984            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
25985            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
25986            31.0, 32.0,
25987        );
25988        let r = _mm512_maskz_cvtt_roundph_epu16::<_MM_FROUND_NO_EXC>(
25989            0b01010101010101010101010101010101,
25990            a,
25991        );
25992        let e = _mm512_set_epi16(
25993            0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16, 0, 18, 0, 20, 0, 22, 0, 24, 0, 26,
25994            0, 28, 0, 30, 0, 32,
25995        );
25996        assert_eq_m512i(r, e);
25997    }
25998
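    // The ph -> epi32 conversions below widen each f16 lane to an i32, so the integer
    // result has as many lanes as there are f16 values consumed: a __m128h contributes
    // only its low four elements to _mm_cvtph_epi32, which is why the unused upper
    // lanes of `a` are padded with 0.0.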
25999    #[simd_test(enable = "avx512fp16,avx512vl")]
26000    unsafe fn test_mm_cvtph_epi32() {
26001        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26002        let r = _mm_cvtph_epi32(a);
26003        let e = _mm_set_epi32(1, 2, 3, 4);
26004        assert_eq_m128i(r, e);
26005    }
26006
26007    #[simd_test(enable = "avx512fp16,avx512vl")]
26008    unsafe fn test_mm_mask_cvtph_epi32() {
26009        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26010        let src = _mm_set_epi32(10, 11, 12, 13);
26011        let r = _mm_mask_cvtph_epi32(src, 0b0101, a);
26012        let e = _mm_set_epi32(10, 2, 12, 4);
26013        assert_eq_m128i(r, e);
26014    }
26015
26016    #[simd_test(enable = "avx512fp16,avx512vl")]
26017    unsafe fn test_mm_maskz_cvtph_epi32() {
26018        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26019        let r = _mm_maskz_cvtph_epi32(0b0101, a);
26020        let e = _mm_set_epi32(0, 2, 0, 4);
26021        assert_eq_m128i(r, e);
26022    }
26023
26024    #[simd_test(enable = "avx512fp16,avx512vl")]
26025    unsafe fn test_mm256_cvtph_epi32() {
26026        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26027        let r = _mm256_cvtph_epi32(a);
26028        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26029        assert_eq_m256i(r, e);
26030    }
26031
26032    #[simd_test(enable = "avx512fp16,avx512vl")]
26033    unsafe fn test_mm256_mask_cvtph_epi32() {
26034        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26035        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26036        let r = _mm256_mask_cvtph_epi32(src, 0b01010101, a);
26037        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26038        assert_eq_m256i(r, e);
26039    }
26040
26041    #[simd_test(enable = "avx512fp16,avx512vl")]
26042    unsafe fn test_mm256_maskz_cvtph_epi32() {
26043        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26044        let r = _mm256_maskz_cvtph_epi32(0b01010101, a);
26045        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26046        assert_eq_m256i(r, e);
26047    }
26048
26049    #[simd_test(enable = "avx512fp16")]
26050    unsafe fn test_mm512_cvtph_epi32() {
26051        let a = _mm256_set_ph(
26052            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26053        );
26054        let r = _mm512_cvtph_epi32(a);
26055        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26056        assert_eq_m512i(r, e);
26057    }
26058
26059    #[simd_test(enable = "avx512fp16")]
26060    unsafe fn test_mm512_mask_cvtph_epi32() {
26061        let a = _mm256_set_ph(
26062            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26063        );
26064        let src = _mm512_set_epi32(
26065            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26066        );
26067        let r = _mm512_mask_cvtph_epi32(src, 0b0101010101010101, a);
26068        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26069        assert_eq_m512i(r, e);
26070    }
26071
26072    #[simd_test(enable = "avx512fp16")]
26073    unsafe fn test_mm512_maskz_cvtph_epi32() {
26074        let a = _mm256_set_ph(
26075            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26076        );
26077        let r = _mm512_maskz_cvtph_epi32(0b0101010101010101, a);
26078        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26079        assert_eq_m512i(r, e);
26080    }
26081
26082    #[simd_test(enable = "avx512fp16")]
26083    unsafe fn test_mm512_cvt_roundph_epi32() {
26084        let a = _mm256_set_ph(
26085            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26086        );
26087        let r = _mm512_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26088        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26089        assert_eq_m512i(r, e);
26090    }
26091
26092    #[simd_test(enable = "avx512fp16")]
26093    unsafe fn test_mm512_mask_cvt_roundph_epi32() {
26094        let a = _mm256_set_ph(
26095            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26096        );
26097        let src = _mm512_set_epi32(
26098            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26099        );
26100        let r = _mm512_mask_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26101            src,
26102            0b0101010101010101,
26103            a,
26104        );
26105        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26106        assert_eq_m512i(r, e);
26107    }
26108
26109    #[simd_test(enable = "avx512fp16")]
26110    unsafe fn test_mm512_maskz_cvt_roundph_epi32() {
26111        let a = _mm256_set_ph(
26112            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26113        );
26114        let r = _mm512_maskz_cvt_roundph_epi32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26115            0b0101010101010101,
26116            a,
26117        );
26118        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26119        assert_eq_m512i(r, e);
26120    }
26121
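    // `_mm_setr_ph` takes its arguments in memory order, so element 0 of `a` is 1.0 and
    // the scalar conversions below return 1. The rounding constant
    // _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC selects round-to-nearest-even with
    // floating-point exceptions suppressed, which does not change the result for an
    // exact input like 1.0.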
26122    #[simd_test(enable = "avx512fp16")]
26123    unsafe fn test_mm_cvtsh_i32() {
26124        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26125        let r = _mm_cvtsh_i32(a);
26126        assert_eq!(r, 1);
26127    }
26128
26129    #[simd_test(enable = "avx512fp16")]
26130    unsafe fn test_mm_cvt_roundsh_i32() {
26131        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26132        let r = _mm_cvt_roundsh_i32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26133        assert_eq!(r, 1);
26134    }
26135
26136    #[simd_test(enable = "avx512fp16,avx512vl")]
26137    unsafe fn test_mm_cvtph_epu32() {
26138        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26139        let r = _mm_cvtph_epu32(a);
26140        let e = _mm_set_epi32(1, 2, 3, 4);
26141        assert_eq_m128i(r, e);
26142    }
26143
26144    #[simd_test(enable = "avx512fp16,avx512vl")]
26145    unsafe fn test_mm_mask_cvtph_epu32() {
26146        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26147        let src = _mm_set_epi32(10, 11, 12, 13);
26148        let r = _mm_mask_cvtph_epu32(src, 0b0101, a);
26149        let e = _mm_set_epi32(10, 2, 12, 4);
26150        assert_eq_m128i(r, e);
26151    }
26152
26153    #[simd_test(enable = "avx512fp16,avx512vl")]
26154    unsafe fn test_mm_maskz_cvtph_epu32() {
26155        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26156        let r = _mm_maskz_cvtph_epu32(0b0101, a);
26157        let e = _mm_set_epi32(0, 2, 0, 4);
26158        assert_eq_m128i(r, e);
26159    }
26160
26161    #[simd_test(enable = "avx512fp16,avx512vl")]
26162    unsafe fn test_mm256_cvtph_epu32() {
26163        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26164        let r = _mm256_cvtph_epu32(a);
26165        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26166        assert_eq_m256i(r, e);
26167    }
26168
26169    #[simd_test(enable = "avx512fp16,avx512vl")]
26170    unsafe fn test_mm256_mask_cvtph_epu32() {
26171        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26172        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26173        let r = _mm256_mask_cvtph_epu32(src, 0b01010101, a);
26174        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26175        assert_eq_m256i(r, e);
26176    }
26177
26178    #[simd_test(enable = "avx512fp16,avx512vl")]
26179    unsafe fn test_mm256_maskz_cvtph_epu32() {
26180        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26181        let r = _mm256_maskz_cvtph_epu32(0b01010101, a);
26182        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26183        assert_eq_m256i(r, e);
26184    }
26185
26186    #[simd_test(enable = "avx512fp16")]
26187    unsafe fn test_mm512_cvtph_epu32() {
26188        let a = _mm256_set_ph(
26189            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26190        );
26191        let r = _mm512_cvtph_epu32(a);
26192        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26193        assert_eq_m512i(r, e);
26194    }
26195
26196    #[simd_test(enable = "avx512fp16")]
26197    unsafe fn test_mm512_mask_cvtph_epu32() {
26198        let a = _mm256_set_ph(
26199            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26200        );
26201        let src = _mm512_set_epi32(
26202            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26203        );
26204        let r = _mm512_mask_cvtph_epu32(src, 0b0101010101010101, a);
26205        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26206        assert_eq_m512i(r, e);
26207    }
26208
26209    #[simd_test(enable = "avx512fp16")]
26210    unsafe fn test_mm512_maskz_cvtph_epu32() {
26211        let a = _mm256_set_ph(
26212            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26213        );
26214        let r = _mm512_maskz_cvtph_epu32(0b0101010101010101, a);
26215        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26216        assert_eq_m512i(r, e);
26217    }
26218
26219    #[simd_test(enable = "avx512fp16")]
26220    unsafe fn test_mm512_cvt_roundph_epu32() {
26221        let a = _mm256_set_ph(
26222            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26223        );
26224        let r = _mm512_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26225        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26226        assert_eq_m512i(r, e);
26227    }
26228
26229    #[simd_test(enable = "avx512fp16")]
26230    unsafe fn test_mm512_mask_cvt_roundph_epu32() {
26231        let a = _mm256_set_ph(
26232            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26233        );
26234        let src = _mm512_set_epi32(
26235            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26236        );
26237        let r = _mm512_mask_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26238            src,
26239            0b0101010101010101,
26240            a,
26241        );
26242        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26243        assert_eq_m512i(r, e);
26244    }
26245
26246    #[simd_test(enable = "avx512fp16")]
26247    unsafe fn test_mm512_maskz_cvt_roundph_epu32() {
26248        let a = _mm256_set_ph(
26249            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26250        );
26251        let r = _mm512_maskz_cvt_roundph_epu32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26252            0b0101010101010101,
26253            a,
26254        );
26255        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26256        assert_eq_m512i(r, e);
26257    }
26258
26259    #[simd_test(enable = "avx512fp16")]
26260    unsafe fn test_mm_cvtsh_u32() {
26261        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26262        let r = _mm_cvtsh_u32(a);
26263        assert_eq!(r, 1);
26264    }
26265
26266    #[simd_test(enable = "avx512fp16")]
26267    unsafe fn test_mm_cvt_roundsh_u32() {
26268        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26269        let r = _mm_cvt_roundsh_u32::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26270        assert_eq!(r, 1);
26271    }
26272
26273    #[simd_test(enable = "avx512fp16,avx512vl")]
26274    unsafe fn test_mm_cvttph_epi32() {
26275        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26276        let r = _mm_cvttph_epi32(a);
26277        let e = _mm_set_epi32(1, 2, 3, 4);
26278        assert_eq_m128i(r, e);
26279    }
26280
26281    #[simd_test(enable = "avx512fp16,avx512vl")]
26282    unsafe fn test_mm_mask_cvttph_epi32() {
26283        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26284        let src = _mm_set_epi32(10, 11, 12, 13);
26285        let r = _mm_mask_cvttph_epi32(src, 0b0101, a);
26286        let e = _mm_set_epi32(10, 2, 12, 4);
26287        assert_eq_m128i(r, e);
26288    }
26289
26290    #[simd_test(enable = "avx512fp16,avx512vl")]
26291    unsafe fn test_mm_maskz_cvttph_epi32() {
26292        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26293        let r = _mm_maskz_cvttph_epi32(0b0101, a);
26294        let e = _mm_set_epi32(0, 2, 0, 4);
26295        assert_eq_m128i(r, e);
26296    }
26297
26298    #[simd_test(enable = "avx512fp16,avx512vl")]
26299    unsafe fn test_mm256_cvttph_epi32() {
26300        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26301        let r = _mm256_cvttph_epi32(a);
26302        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26303        assert_eq_m256i(r, e);
26304    }
26305
26306    #[simd_test(enable = "avx512fp16,avx512vl")]
26307    unsafe fn test_mm256_mask_cvttph_epi32() {
26308        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26309        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26310        let r = _mm256_mask_cvttph_epi32(src, 0b01010101, a);
26311        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26312        assert_eq_m256i(r, e);
26313    }
26314
26315    #[simd_test(enable = "avx512fp16,avx512vl")]
26316    unsafe fn test_mm256_maskz_cvttph_epi32() {
26317        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26318        let r = _mm256_maskz_cvttph_epi32(0b01010101, a);
26319        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26320        assert_eq_m256i(r, e);
26321    }
26322
26323    #[simd_test(enable = "avx512fp16")]
26324    unsafe fn test_mm512_cvttph_epi32() {
26325        let a = _mm256_set_ph(
26326            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26327        );
26328        let r = _mm512_cvttph_epi32(a);
26329        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26330        assert_eq_m512i(r, e);
26331    }
26332
26333    #[simd_test(enable = "avx512fp16")]
26334    unsafe fn test_mm512_mask_cvttph_epi32() {
26335        let a = _mm256_set_ph(
26336            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26337        );
26338        let src = _mm512_set_epi32(
26339            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26340        );
26341        let r = _mm512_mask_cvttph_epi32(src, 0b0101010101010101, a);
26342        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26343        assert_eq_m512i(r, e);
26344    }
26345
26346    #[simd_test(enable = "avx512fp16")]
26347    unsafe fn test_mm512_maskz_cvttph_epi32() {
26348        let a = _mm256_set_ph(
26349            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26350        );
26351        let r = _mm512_maskz_cvttph_epi32(0b0101010101010101, a);
26352        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26353        assert_eq_m512i(r, e);
26354    }
26355
26356    #[simd_test(enable = "avx512fp16")]
26357    unsafe fn test_mm512_cvtt_roundph_epi32() {
26358        let a = _mm256_set_ph(
26359            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26360        );
26361        let r = _mm512_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(a);
26362        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26363        assert_eq_m512i(r, e);
26364    }
26365
26366    #[simd_test(enable = "avx512fp16")]
26367    unsafe fn test_mm512_mask_cvtt_roundph_epi32() {
26368        let a = _mm256_set_ph(
26369            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26370        );
26371        let src = _mm512_set_epi32(
26372            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26373        );
26374        let r = _mm512_mask_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26375        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26376        assert_eq_m512i(r, e);
26377    }
26378
26379    #[simd_test(enable = "avx512fp16")]
26380    unsafe fn test_mm512_maskz_cvtt_roundph_epi32() {
26381        let a = _mm256_set_ph(
26382            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26383        );
26384        let r = _mm512_maskz_cvtt_roundph_epi32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26385        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26386        assert_eq_m512i(r, e);
26387    }
26388
26389    #[simd_test(enable = "avx512fp16")]
26390    unsafe fn test_mm_cvttsh_i32() {
26391        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26392        let r = _mm_cvttsh_i32(a);
26393        assert_eq!(r, 1);
26394    }
26395
26396    #[simd_test(enable = "avx512fp16")]
26397    unsafe fn test_mm_cvtt_roundsh_i32() {
26398        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26399        let r = _mm_cvtt_roundsh_i32::<_MM_FROUND_NO_EXC>(a);
26400        assert_eq!(r, 1);
26401    }
26402
26403    #[simd_test(enable = "avx512fp16,avx512vl")]
26404    unsafe fn test_mm_cvttph_epu32() {
26405        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26406        let r = _mm_cvttph_epu32(a);
26407        let e = _mm_set_epi32(1, 2, 3, 4);
26408        assert_eq_m128i(r, e);
26409    }
26410
26411    #[simd_test(enable = "avx512fp16,avx512vl")]
26412    unsafe fn test_mm_mask_cvttph_epu32() {
26413        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26414        let src = _mm_set_epi32(10, 11, 12, 13);
26415        let r = _mm_mask_cvttph_epu32(src, 0b0101, a);
26416        let e = _mm_set_epi32(10, 2, 12, 4);
26417        assert_eq_m128i(r, e);
26418    }
26419
26420    #[simd_test(enable = "avx512fp16,avx512vl")]
26421    unsafe fn test_mm_maskz_cvttph_epu32() {
26422        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26423        let r = _mm_maskz_cvttph_epu32(0b0101, a);
26424        let e = _mm_set_epi32(0, 2, 0, 4);
26425        assert_eq_m128i(r, e);
26426    }
26427
26428    #[simd_test(enable = "avx512fp16,avx512vl")]
26429    unsafe fn test_mm256_cvttph_epu32() {
26430        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26431        let r = _mm256_cvttph_epu32(a);
26432        let e = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
26433        assert_eq_m256i(r, e);
26434    }
26435
26436    #[simd_test(enable = "avx512fp16,avx512vl")]
26437    unsafe fn test_mm256_mask_cvttph_epu32() {
26438        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26439        let src = _mm256_set_epi32(10, 11, 12, 13, 14, 15, 16, 17);
26440        let r = _mm256_mask_cvttph_epu32(src, 0b01010101, a);
26441        let e = _mm256_set_epi32(10, 2, 12, 4, 14, 6, 16, 8);
26442        assert_eq_m256i(r, e);
26443    }
26444
26445    #[simd_test(enable = "avx512fp16,avx512vl")]
26446    unsafe fn test_mm256_maskz_cvttph_epu32() {
26447        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26448        let r = _mm256_maskz_cvttph_epu32(0b01010101, a);
26449        let e = _mm256_set_epi32(0, 2, 0, 4, 0, 6, 0, 8);
26450        assert_eq_m256i(r, e);
26451    }
26452
26453    #[simd_test(enable = "avx512fp16")]
26454    unsafe fn test_mm512_cvttph_epu32() {
26455        let a = _mm256_set_ph(
26456            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26457        );
26458        let r = _mm512_cvttph_epu32(a);
26459        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26460        assert_eq_m512i(r, e);
26461    }
26462
26463    #[simd_test(enable = "avx512fp16")]
26464    unsafe fn test_mm512_mask_cvttph_epu32() {
26465        let a = _mm256_set_ph(
26466            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26467        );
26468        let src = _mm512_set_epi32(
26469            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26470        );
26471        let r = _mm512_mask_cvttph_epu32(src, 0b0101010101010101, a);
26472        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26473        assert_eq_m512i(r, e);
26474    }
26475
26476    #[simd_test(enable = "avx512fp16")]
26477    unsafe fn test_mm512_maskz_cvttph_epu32() {
26478        let a = _mm256_set_ph(
26479            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26480        );
26481        let r = _mm512_maskz_cvttph_epu32(0b0101010101010101, a);
26482        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26483        assert_eq_m512i(r, e);
26484    }
26485
26486    #[simd_test(enable = "avx512fp16")]
26487    unsafe fn test_mm512_cvtt_roundph_epu32() {
26488        let a = _mm256_set_ph(
26489            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26490        );
26491        let r = _mm512_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(a);
26492        let e = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
26493        assert_eq_m512i(r, e);
26494    }
26495
26496    #[simd_test(enable = "avx512fp16")]
26497    unsafe fn test_mm512_mask_cvtt_roundph_epu32() {
26498        let a = _mm256_set_ph(
26499            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26500        );
26501        let src = _mm512_set_epi32(
26502            10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26503        );
26504        let r = _mm512_mask_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
26505        let e = _mm512_set_epi32(10, 2, 12, 4, 14, 6, 16, 8, 18, 10, 20, 12, 22, 14, 24, 16);
26506        assert_eq_m512i(r, e);
26507    }
26508
26509    #[simd_test(enable = "avx512fp16")]
26510    unsafe fn test_mm512_maskz_cvtt_roundph_epu32() {
26511        let a = _mm256_set_ph(
26512            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26513        );
26514        let r = _mm512_maskz_cvtt_roundph_epu32::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
26515        let e = _mm512_set_epi32(0, 2, 0, 4, 0, 6, 0, 8, 0, 10, 0, 12, 0, 14, 0, 16);
26516        assert_eq_m512i(r, e);
26517    }
26518
26519    #[simd_test(enable = "avx512fp16")]
26520    unsafe fn test_mm_cvttsh_u32() {
26521        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26522        let r = _mm_cvttsh_u32(a);
26523        assert_eq!(r, 1);
26524    }
26525
26526    #[simd_test(enable = "avx512fp16")]
26527    unsafe fn test_mm_cvtt_roundsh_u32() {
26528        let a = _mm_setr_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26529        let r = _mm_cvtt_roundsh_u32::<_MM_FROUND_NO_EXC>(a);
26530        assert_eq!(r, 1);
26531    }
26532
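    // The ph -> epi64 conversions consume only as many low f16 lanes as the result has
    // i64 lanes (two for __m128i, four for __m256i, eight for __m512i), so the remaining
    // elements of the __m128h input are set to 0.0 and ignored.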
26533    #[simd_test(enable = "avx512fp16,avx512vl")]
26534    unsafe fn test_mm_cvtph_epi64() {
26535        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26536        let r = _mm_cvtph_epi64(a);
26537        let e = _mm_set_epi64x(1, 2);
26538        assert_eq_m128i(r, e);
26539    }
26540
26541    #[simd_test(enable = "avx512fp16,avx512vl")]
26542    unsafe fn test_mm_mask_cvtph_epi64() {
26543        let src = _mm_set_epi64x(3, 4);
26544        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26545        let r = _mm_mask_cvtph_epi64(src, 0b01, a);
26546        let e = _mm_set_epi64x(3, 2);
26547        assert_eq_m128i(r, e);
26548    }
26549
26550    #[simd_test(enable = "avx512fp16,avx512vl")]
26551    unsafe fn test_mm_maskz_cvtph_epi64() {
26552        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26553        let r = _mm_maskz_cvtph_epi64(0b01, a);
26554        let e = _mm_set_epi64x(0, 2);
26555        assert_eq_m128i(r, e);
26556    }
26557
26558    #[simd_test(enable = "avx512fp16,avx512vl")]
26559    unsafe fn test_mm256_cvtph_epi64() {
26560        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26561        let r = _mm256_cvtph_epi64(a);
26562        let e = _mm256_set_epi64x(1, 2, 3, 4);
26563        assert_eq_m256i(r, e);
26564    }
26565
26566    #[simd_test(enable = "avx512fp16,avx512vl")]
26567    unsafe fn test_mm256_mask_cvtph_epi64() {
26568        let src = _mm256_set_epi64x(5, 6, 7, 8);
26569        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26570        let r = _mm256_mask_cvtph_epi64(src, 0b0101, a);
26571        let e = _mm256_set_epi64x(5, 2, 7, 4);
26572        assert_eq_m256i(r, e);
26573    }
26574
26575    #[simd_test(enable = "avx512fp16,avx512vl")]
26576    unsafe fn test_mm256_maskz_cvtph_epi64() {
26577        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26578        let r = _mm256_maskz_cvtph_epi64(0b0101, a);
26579        let e = _mm256_set_epi64x(0, 2, 0, 4);
26580        assert_eq_m256i(r, e);
26581    }
26582
26583    #[simd_test(enable = "avx512fp16")]
26584    unsafe fn test_mm512_cvtph_epi64() {
26585        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26586        let r = _mm512_cvtph_epi64(a);
26587        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26588        assert_eq_m512i(r, e);
26589    }
26590
26591    #[simd_test(enable = "avx512fp16")]
26592    unsafe fn test_mm512_mask_cvtph_epi64() {
26593        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26594        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26595        let r = _mm512_mask_cvtph_epi64(src, 0b01010101, a);
26596        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26597        assert_eq_m512i(r, e);
26598    }
26599
26600    #[simd_test(enable = "avx512fp16")]
26601    unsafe fn test_mm512_maskz_cvtph_epi64() {
26602        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26603        let r = _mm512_maskz_cvtph_epi64(0b01010101, a);
26604        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26605        assert_eq_m512i(r, e);
26606    }
26607
26608    #[simd_test(enable = "avx512fp16")]
26609    unsafe fn test_mm512_cvt_roundph_epi64() {
26610        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26611        let r = _mm512_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26612        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26613        assert_eq_m512i(r, e);
26614    }
26615
26616    #[simd_test(enable = "avx512fp16")]
26617    unsafe fn test_mm512_mask_cvt_roundph_epi64() {
26618        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26619        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26620        let r = _mm512_mask_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26621            src, 0b01010101, a,
26622        );
26623        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26624        assert_eq_m512i(r, e);
26625    }
26626
26627    #[simd_test(enable = "avx512fp16")]
26628    unsafe fn test_mm512_maskz_cvt_roundph_epi64() {
26629        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26630        let r = _mm512_maskz_cvt_roundph_epi64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26631            0b01010101, a,
26632        );
26633        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26634        assert_eq_m512i(r, e);
26635    }
26636
26637    #[simd_test(enable = "avx512fp16,avx512vl")]
26638    unsafe fn test_mm_cvtph_epu64() {
26639        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26640        let r = _mm_cvtph_epu64(a);
26641        let e = _mm_set_epi64x(1, 2);
26642        assert_eq_m128i(r, e);
26643    }
26644
26645    #[simd_test(enable = "avx512fp16,avx512vl")]
26646    unsafe fn test_mm_mask_cvtph_epu64() {
26647        let src = _mm_set_epi64x(3, 4);
26648        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26649        let r = _mm_mask_cvtph_epu64(src, 0b01, a);
26650        let e = _mm_set_epi64x(3, 2);
26651        assert_eq_m128i(r, e);
26652    }
26653
26654    #[simd_test(enable = "avx512fp16,avx512vl")]
26655    unsafe fn test_mm_maskz_cvtph_epu64() {
26656        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26657        let r = _mm_maskz_cvtph_epu64(0b01, a);
26658        let e = _mm_set_epi64x(0, 2);
26659        assert_eq_m128i(r, e);
26660    }
26661
26662    #[simd_test(enable = "avx512fp16,avx512vl")]
26663    unsafe fn test_mm256_cvtph_epu64() {
26664        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26665        let r = _mm256_cvtph_epu64(a);
26666        let e = _mm256_set_epi64x(1, 2, 3, 4);
26667        assert_eq_m256i(r, e);
26668    }
26669
26670    #[simd_test(enable = "avx512fp16,avx512vl")]
26671    unsafe fn test_mm256_mask_cvtph_epu64() {
26672        let src = _mm256_set_epi64x(5, 6, 7, 8);
26673        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26674        let r = _mm256_mask_cvtph_epu64(src, 0b0101, a);
26675        let e = _mm256_set_epi64x(5, 2, 7, 4);
26676        assert_eq_m256i(r, e);
26677    }
26678
26679    #[simd_test(enable = "avx512fp16,avx512vl")]
26680    unsafe fn test_mm256_maskz_cvtph_epu64() {
26681        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26682        let r = _mm256_maskz_cvtph_epu64(0b0101, a);
26683        let e = _mm256_set_epi64x(0, 2, 0, 4);
26684        assert_eq_m256i(r, e);
26685    }
26686
26687    #[simd_test(enable = "avx512fp16")]
26688    unsafe fn test_mm512_cvtph_epu64() {
26689        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26690        let r = _mm512_cvtph_epu64(a);
26691        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26692        assert_eq_m512i(r, e);
26693    }
26694
26695    #[simd_test(enable = "avx512fp16")]
26696    unsafe fn test_mm512_mask_cvtph_epu64() {
26697        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26698        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26699        let r = _mm512_mask_cvtph_epu64(src, 0b01010101, a);
26700        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26701        assert_eq_m512i(r, e);
26702    }
26703
26704    #[simd_test(enable = "avx512fp16")]
26705    unsafe fn test_mm512_maskz_cvtph_epu64() {
26706        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26707        let r = _mm512_maskz_cvtph_epu64(0b01010101, a);
26708        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26709        assert_eq_m512i(r, e);
26710    }
26711
26712    #[simd_test(enable = "avx512fp16")]
26713    unsafe fn test_mm512_cvt_roundph_epu64() {
26714        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26715        let r = _mm512_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(a);
26716        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26717        assert_eq_m512i(r, e);
26718    }
26719
26720    #[simd_test(enable = "avx512fp16")]
26721    unsafe fn test_mm512_mask_cvt_roundph_epu64() {
26722        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26723        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26724        let r = _mm512_mask_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26725            src, 0b01010101, a,
26726        );
26727        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26728        assert_eq_m512i(r, e);
26729    }
26730
26731    #[simd_test(enable = "avx512fp16")]
26732    unsafe fn test_mm512_maskz_cvt_roundph_epu64() {
26733        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26734        let r = _mm512_maskz_cvt_roundph_epu64::<{ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC }>(
26735            0b01010101, a,
26736        );
26737        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26738        assert_eq_m512i(r, e);
26739    }
26740
26741    #[simd_test(enable = "avx512fp16,avx512vl")]
26742    unsafe fn test_mm_cvttph_epi64() {
26743        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26744        let r = _mm_cvttph_epi64(a);
26745        let e = _mm_set_epi64x(1, 2);
26746        assert_eq_m128i(r, e);
26747    }
26748
26749    #[simd_test(enable = "avx512fp16,avx512vl")]
26750    unsafe fn test_mm_mask_cvttph_epi64() {
26751        let src = _mm_set_epi64x(3, 4);
26752        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26753        let r = _mm_mask_cvttph_epi64(src, 0b01, a);
26754        let e = _mm_set_epi64x(3, 2);
26755        assert_eq_m128i(r, e);
26756    }
26757
26758    #[simd_test(enable = "avx512fp16,avx512vl")]
26759    unsafe fn test_mm_maskz_cvttph_epi64() {
26760        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26761        let r = _mm_maskz_cvttph_epi64(0b01, a);
26762        let e = _mm_set_epi64x(0, 2);
26763        assert_eq_m128i(r, e);
26764    }
26765
26766    #[simd_test(enable = "avx512fp16,avx512vl")]
26767    unsafe fn test_mm256_cvttph_epi64() {
26768        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26769        let r = _mm256_cvttph_epi64(a);
26770        let e = _mm256_set_epi64x(1, 2, 3, 4);
26771        assert_eq_m256i(r, e);
26772    }
26773
26774    #[simd_test(enable = "avx512fp16,avx512vl")]
26775    unsafe fn test_mm256_mask_cvttph_epi64() {
26776        let src = _mm256_set_epi64x(5, 6, 7, 8);
26777        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26778        let r = _mm256_mask_cvttph_epi64(src, 0b0101, a);
26779        let e = _mm256_set_epi64x(5, 2, 7, 4);
26780        assert_eq_m256i(r, e);
26781    }
26782
26783    #[simd_test(enable = "avx512fp16,avx512vl")]
26784    unsafe fn test_mm256_maskz_cvttph_epi64() {
26785        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26786        let r = _mm256_maskz_cvttph_epi64(0b0101, a);
26787        let e = _mm256_set_epi64x(0, 2, 0, 4);
26788        assert_eq_m256i(r, e);
26789    }
26790
26791    #[simd_test(enable = "avx512fp16")]
26792    unsafe fn test_mm512_cvttph_epi64() {
26793        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26794        let r = _mm512_cvttph_epi64(a);
26795        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26796        assert_eq_m512i(r, e);
26797    }
26798
26799    #[simd_test(enable = "avx512fp16")]
26800    unsafe fn test_mm512_mask_cvttph_epi64() {
26801        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26802        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26803        let r = _mm512_mask_cvttph_epi64(src, 0b01010101, a);
26804        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26805        assert_eq_m512i(r, e);
26806    }
26807
26808    #[simd_test(enable = "avx512fp16")]
26809    unsafe fn test_mm512_maskz_cvttph_epi64() {
26810        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26811        let r = _mm512_maskz_cvttph_epi64(0b01010101, a);
26812        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26813        assert_eq_m512i(r, e);
26814    }
26815
26816    #[simd_test(enable = "avx512fp16")]
26817    unsafe fn test_mm512_cvtt_roundph_epi64() {
26818        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26819        let r = _mm512_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(a);
26820        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26821        assert_eq_m512i(r, e);
26822    }
26823
26824    #[simd_test(enable = "avx512fp16")]
26825    unsafe fn test_mm512_mask_cvtt_roundph_epi64() {
26826        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26827        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26828        let r = _mm512_mask_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26829        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26830        assert_eq_m512i(r, e);
26831    }
26832
26833    #[simd_test(enable = "avx512fp16")]
26834    unsafe fn test_mm512_maskz_cvtt_roundph_epi64() {
26835        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26836        let r = _mm512_maskz_cvtt_roundph_epi64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26837        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26838        assert_eq_m512i(r, e);
26839    }
26840
26841    #[simd_test(enable = "avx512fp16,avx512vl")]
26842    unsafe fn test_mm_cvttph_epu64() {
26843        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26844        let r = _mm_cvttph_epu64(a);
26845        let e = _mm_set_epi64x(1, 2);
26846        assert_eq_m128i(r, e);
26847    }
26848
26849    #[simd_test(enable = "avx512fp16,avx512vl")]
26850    unsafe fn test_mm_mask_cvttph_epu64() {
26851        let src = _mm_set_epi64x(3, 4);
26852        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26853        let r = _mm_mask_cvttph_epu64(src, 0b01, a);
26854        let e = _mm_set_epi64x(3, 2);
26855        assert_eq_m128i(r, e);
26856    }
26857
26858    #[simd_test(enable = "avx512fp16,avx512vl")]
26859    unsafe fn test_mm_maskz_cvttph_epu64() {
26860        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
26861        let r = _mm_maskz_cvttph_epu64(0b01, a);
26862        let e = _mm_set_epi64x(0, 2);
26863        assert_eq_m128i(r, e);
26864    }
26865
26866    #[simd_test(enable = "avx512fp16,avx512vl")]
26867    unsafe fn test_mm256_cvttph_epu64() {
26868        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26869        let r = _mm256_cvttph_epu64(a);
26870        let e = _mm256_set_epi64x(1, 2, 3, 4);
26871        assert_eq_m256i(r, e);
26872    }
26873
26874    #[simd_test(enable = "avx512fp16,avx512vl")]
26875    unsafe fn test_mm256_mask_cvttph_epu64() {
26876        let src = _mm256_set_epi64x(5, 6, 7, 8);
26877        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26878        let r = _mm256_mask_cvttph_epu64(src, 0b0101, a);
26879        let e = _mm256_set_epi64x(5, 2, 7, 4);
26880        assert_eq_m256i(r, e);
26881    }
26882
26883    #[simd_test(enable = "avx512fp16,avx512vl")]
26884    unsafe fn test_mm256_maskz_cvttph_epu64() {
26885        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26886        let r = _mm256_maskz_cvttph_epu64(0b0101, a);
26887        let e = _mm256_set_epi64x(0, 2, 0, 4);
26888        assert_eq_m256i(r, e);
26889    }
26890
26891    #[simd_test(enable = "avx512fp16")]
26892    unsafe fn test_mm512_cvttph_epu64() {
26893        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26894        let r = _mm512_cvttph_epu64(a);
26895        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26896        assert_eq_m512i(r, e);
26897    }
26898
26899    #[simd_test(enable = "avx512fp16")]
26900    unsafe fn test_mm512_mask_cvttph_epu64() {
26901        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26902        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26903        let r = _mm512_mask_cvttph_epu64(src, 0b01010101, a);
26904        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26905        assert_eq_m512i(r, e);
26906    }
26907
26908    #[simd_test(enable = "avx512fp16")]
26909    unsafe fn test_mm512_maskz_cvttph_epu64() {
26910        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26911        let r = _mm512_maskz_cvttph_epu64(0b01010101, a);
26912        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26913        assert_eq_m512i(r, e);
26914    }
26915
26916    #[simd_test(enable = "avx512fp16")]
26917    unsafe fn test_mm512_cvtt_roundph_epu64() {
26918        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26919        let r = _mm512_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(a);
26920        let e = _mm512_set_epi64(1, 2, 3, 4, 5, 6, 7, 8);
26921        assert_eq_m512i(r, e);
26922    }
26923
26924    #[simd_test(enable = "avx512fp16")]
26925    unsafe fn test_mm512_mask_cvtt_roundph_epu64() {
26926        let src = _mm512_set_epi64(9, 10, 11, 12, 13, 14, 15, 16);
26927        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26928        let r = _mm512_mask_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
26929        let e = _mm512_set_epi64(9, 2, 11, 4, 13, 6, 15, 8);
26930        assert_eq_m512i(r, e);
26931    }
26932
26933    #[simd_test(enable = "avx512fp16")]
26934    unsafe fn test_mm512_maskz_cvtt_roundph_epu64() {
26935        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26936        let r = _mm512_maskz_cvtt_roundph_epu64::<_MM_FROUND_NO_EXC>(0b01010101, a);
26937        let e = _mm512_set_epi64(0, 2, 0, 4, 0, 6, 0, 8);
26938        assert_eq_m512i(r, e);
26939    }
26940
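    // `cvtxph_ps` widens f16 lanes to f32. Unlike the older F16C `_mm_cvtph_ps`, which
    // takes packed halves in a __m128i, these AVX512-FP16 variants take a
    // __m128h/__m256h source directly; the widening itself is exact.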
26941    #[simd_test(enable = "avx512fp16,avx512vl")]
26942    unsafe fn test_mm_cvtxph_ps() {
26943        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26944        let r = _mm_cvtxph_ps(a);
26945        let e = _mm_set_ps(1.0, 2.0, 3.0, 4.0);
26946        assert_eq_m128(r, e);
26947    }
26948
26949    #[simd_test(enable = "avx512fp16,avx512vl")]
26950    unsafe fn test_mm_mask_cvtxph_ps() {
26951        let src = _mm_set_ps(10.0, 11.0, 12.0, 13.0);
26952        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26953        let r = _mm_mask_cvtxph_ps(src, 0b0101, a);
26954        let e = _mm_set_ps(10.0, 2.0, 12.0, 4.0);
26955        assert_eq_m128(r, e);
26956    }
26957
26958    #[simd_test(enable = "avx512fp16,avx512vl")]
26959    unsafe fn test_mm_maskz_cvtxph_ps() {
26960        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
26961        let r = _mm_maskz_cvtxph_ps(0b0101, a);
26962        let e = _mm_set_ps(0.0, 2.0, 0.0, 4.0);
26963        assert_eq_m128(r, e);
26964    }
26965
26966    #[simd_test(enable = "avx512fp16,avx512vl")]
26967    unsafe fn test_mm256_cvtxph_ps() {
26968        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26969        let r = _mm256_cvtxph_ps(a);
26970        let e = _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26971        assert_eq_m256(r, e);
26972    }
26973
26974    #[simd_test(enable = "avx512fp16,avx512vl")]
26975    unsafe fn test_mm256_mask_cvtxph_ps() {
26976        let src = _mm256_set_ps(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
26977        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26978        let r = _mm256_mask_cvtxph_ps(src, 0b01010101, a);
26979        let e = _mm256_set_ps(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
26980        assert_eq_m256(r, e);
26981    }
26982
26983    #[simd_test(enable = "avx512fp16,avx512vl")]
26984    unsafe fn test_mm256_maskz_cvtxph_ps() {
26985        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
26986        let r = _mm256_maskz_cvtxph_ps(0b01010101, a);
26987        let e = _mm256_set_ps(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
26988        assert_eq_m256(r, e);
26989    }
26990
26991    #[simd_test(enable = "avx512fp16")]
26992    unsafe fn test_mm512_cvtxph_ps() {
26993        let a = _mm256_set_ph(
26994            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26995        );
26996        let r = _mm512_cvtxph_ps(a);
26997        let e = _mm512_set_ps(
26998            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
26999        );
27000        assert_eq_m512(r, e);
27001    }
27002
27003    #[simd_test(enable = "avx512fp16")]
27004    unsafe fn test_mm512_mask_cvtxph_ps() {
27005        let src = _mm512_set_ps(
27006            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
27007            24.0, 25.0,
27008        );
27009        let a = _mm256_set_ph(
27010            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27011        );
27012        let r = _mm512_mask_cvtxph_ps(src, 0b0101010101010101, a);
27013        let e = _mm512_set_ps(
27014            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
27015            16.0,
27016        );
27017        assert_eq_m512(r, e);
27018    }
27019
27020    #[simd_test(enable = "avx512fp16")]
27021    unsafe fn test_mm512_maskz_cvtxph_ps() {
27022        let a = _mm256_set_ph(
27023            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27024        );
27025        let r = _mm512_maskz_cvtxph_ps(0b0101010101010101, a);
27026        let e = _mm512_set_ps(
27027            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
27028        );
27029        assert_eq_m512(r, e);
27030    }
27031
27032    #[simd_test(enable = "avx512fp16")]
27033    unsafe fn test_mm512_cvtx_roundph_ps() {
27034        let a = _mm256_set_ph(
27035            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27036        );
27037        let r = _mm512_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(a);
27038        let e = _mm512_set_ps(
27039            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27040        );
27041        assert_eq_m512(r, e);
27042    }
27043
27044    #[simd_test(enable = "avx512fp16")]
27045    unsafe fn test_mm512_mask_cvtx_roundph_ps() {
27046        let src = _mm512_set_ps(
27047            10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
27048            24.0, 25.0,
27049        );
27050        let a = _mm256_set_ph(
27051            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27052        );
27053        let r = _mm512_mask_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(src, 0b0101010101010101, a);
27054        let e = _mm512_set_ps(
27055            10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0, 18.0, 10.0, 20.0, 12.0, 22.0, 14.0, 24.0,
27056            16.0,
27057        );
27058        assert_eq_m512(r, e);
27059    }
27060
27061    #[simd_test(enable = "avx512fp16")]
27062    unsafe fn test_mm512_maskz_cvtx_roundph_ps() {
27063        let a = _mm256_set_ph(
27064            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
27065        );
27066        let r = _mm512_maskz_cvtx_roundph_ps::<_MM_FROUND_NO_EXC>(0b0101010101010101, a);
27067        let e = _mm512_set_ps(
27068            0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0, 0.0, 10.0, 0.0, 12.0, 0.0, 14.0, 0.0, 16.0,
27069        );
27070        assert_eq_m512(r, e);
27071    }
27072
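    // The scalar sh -> ss conversions merge: the low f32 lane of the result is the
    // converted low f16 of `b`, and the upper three lanes are copied from `a`. In the
    // masked variants the single mask bit affects only that low lane, choosing between
    // the converted value and `src` (or zero for the maskz form).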
27073    #[simd_test(enable = "avx512fp16")]
27074    unsafe fn test_mm_cvtsh_ss() {
27075        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27076        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27077        let r = _mm_cvtsh_ss(a, b);
27078        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27079        assert_eq_m128(r, e);
27080    }
27081
27082    #[simd_test(enable = "avx512fp16")]
27083    unsafe fn test_mm_mask_cvtsh_ss() {
27084        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27085        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27086        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27087        let r = _mm_mask_cvtsh_ss(src, 0, a, b);
27088        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27089        assert_eq_m128(r, e);
27090        let r = _mm_mask_cvtsh_ss(src, 1, a, b);
27091        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27092        assert_eq_m128(r, e);
27093    }
27094
27095    #[simd_test(enable = "avx512fp16")]
27096    unsafe fn test_mm_maskz_cvtsh_ss() {
27097        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27098        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27099        let r = _mm_maskz_cvtsh_ss(0, a, b);
27100        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27101        assert_eq_m128(r, e);
27102        let r = _mm_maskz_cvtsh_ss(1, a, b);
27103        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27104        assert_eq_m128(r, e);
27105    }
27106
27107    #[simd_test(enable = "avx512fp16")]
27108    unsafe fn test_mm_cvt_roundsh_ss() {
27109        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27110        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27111        let r = _mm_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(a, b);
27112        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27113        assert_eq_m128(r, e);
27114    }
27115
27116    #[simd_test(enable = "avx512fp16")]
27117    unsafe fn test_mm_mask_cvt_roundsh_ss() {
27118        let src = _mm_setr_ps(3.0, 11.0, 12.0, 13.0);
27119        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27120        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27121        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 0, a, b);
27122        let e = _mm_setr_ps(3.0, 20.0, 21.0, 22.0);
27123        assert_eq_m128(r, e);
27124        let r = _mm_mask_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(src, 1, a, b);
27125        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27126        assert_eq_m128(r, e);
27127    }
27128
27129    #[simd_test(enable = "avx512fp16")]
27130    unsafe fn test_mm_maskz_cvt_roundsh_ss() {
27131        let a = _mm_setr_ps(2.0, 20.0, 21.0, 22.0);
27132        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
27133        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(0, a, b);
27134        let e = _mm_setr_ps(0.0, 20.0, 21.0, 22.0);
27135        assert_eq_m128(r, e);
27136        let r = _mm_maskz_cvt_roundsh_ss::<_MM_FROUND_NO_EXC>(1, a, b);
27137        let e = _mm_setr_ps(1.0, 20.0, 21.0, 22.0);
27138        assert_eq_m128(r, e);
27139    }
27140
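    // Every finite f16 value is exactly representable as an f64, so the ph -> pd
    // widening below reproduces the inputs verbatim.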
    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_cvtph_pd(a);
        let e = _mm_set_pd(1.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_mask_cvtph_pd() {
        let src = _mm_set_pd(10.0, 11.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_mask_cvtph_pd(src, 0b01, a);
        let e = _mm_set_pd(10.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0);
        let r = _mm_maskz_cvtph_pd(0b01, a);
        let e = _mm_set_pd(0.0, 2.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_cvtph_pd(a);
        let e = _mm256_set_pd(1.0, 2.0, 3.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_pd() {
        let src = _mm256_set_pd(10.0, 11.0, 12.0, 13.0);
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_mask_cvtph_pd(src, 0b0101, a);
        let e = _mm256_set_pd(10.0, 2.0, 12.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_pd() {
        let a = _mm_set_ph(0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0);
        let r = _mm256_maskz_cvtph_pd(0b0101, a);
        let e = _mm256_set_pd(0.0, 2.0, 0.0, 4.0);
        assert_eq_m256d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvtph_pd(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvtph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvtph_pd(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvtph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvtph_pd(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }
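
    // Illustrative sketch, not part of the original test suite: with all mask
    // bits set, the merge-masked conversion should match the unmasked one, and
    // with no mask bits set the zero-masking form should clear every lane.
    // Only intrinsics and helpers already exercised above are used, plus
    // `_mm512_setzero_pd` (AVX-512F), assumed to be in scope here.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtph_pd_mask_identity_sketch() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        // All eight mask bits set: every lane comes from the converted input.
        let full = _mm512_cvtph_pd(a);
        let masked = _mm512_mask_cvtph_pd(_mm512_setzero_pd(), 0b11111111, a);
        assert_eq_m512d(masked, full);
        // No mask bits set: zero-masking clears every lane.
        let zeroed = _mm512_maskz_cvtph_pd(0, a);
        assert_eq_m512d(zeroed, _mm512_setzero_pd());
    }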

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(a);
        let e = _mm512_set_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_mask_cvt_roundph_pd() {
        let src = _mm512_set_pd(10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0);
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_mask_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(src, 0b01010101, a);
        let e = _mm512_set_pd(10.0, 2.0, 12.0, 4.0, 14.0, 6.0, 16.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_maskz_cvt_roundph_pd() {
        let a = _mm_set_ph(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm512_maskz_cvt_roundph_pd::<_MM_FROUND_NO_EXC>(0b01010101, a);
        let e = _mm512_set_pd(0.0, 2.0, 0.0, 4.0, 0.0, 6.0, 0.0, 8.0);
        assert_eq_m512d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvtsh_sd(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvtsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvtsh_sd(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvtsh_sd(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvtsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvtsh_sd(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvtsh_sd(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_mask_cvt_roundsh_sd() {
        let src = _mm_setr_pd(3.0, 11.0);
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 0, a, b);
        let e = _mm_setr_pd(3.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_mask_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(src, 1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_maskz_cvt_roundsh_sd() {
        let a = _mm_setr_pd(2.0, 20.0);
        let b = _mm_setr_ph(1.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(0, a, b);
        let e = _mm_setr_pd(0.0, 20.0);
        assert_eq_m128d(r, e);
        let r = _mm_maskz_cvt_roundsh_sd::<_MM_FROUND_NO_EXC>(1, a, b);
        let e = _mm_setr_pd(1.0, 20.0);
        assert_eq_m128d(r, e);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsh_h() {
        let a = _mm_setr_ph(1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0);
        let r = _mm_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm256_cvtsh_h() {
        let a = _mm256_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let r = _mm256_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm512_cvtsh_h() {
        let a = _mm512_setr_ph(
            1.0, 2.0, 3.0, 42.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
            17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0,
            31.0, 32.0,
        );
        let r = _mm512_cvtsh_h(a);
        assert_eq!(r, 1.0);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi128_si16() {
        let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let r = _mm_cvtsi128_si16(a);
        assert_eq!(r, 1);
    }

    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi16_si128() {
        let a = 1;
        let r = _mm_cvtsi16_si128(a);
        let e = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m128i(r, e);
    }
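
    // Illustrative sketch, not part of the original test suite: moving a
    // 16-bit integer into the low lane of a vector and extracting it again
    // should round-trip the value. Only `_mm_cvtsi16_si128` and
    // `_mm_cvtsi128_si16`, exercised by the two tests above, are used.
    #[simd_test(enable = "avx512fp16")]
    unsafe fn test_mm_cvtsi16_si128_roundtrip_sketch() {
        let a: i16 = -42;
        let v = _mm_cvtsi16_si128(a);
        let r = _mm_cvtsi128_si16(v);
        assert_eq!(r, a);
    }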
}